Impact of Environmental, Social and Governance (ESG) News on Stock Market Indices

In [1]:
#Import packages
import numpy as np
!pip install pandas
import pandas as pd
from pandas.plotting import scatter_matrix
import matplotlib.pyplot as plt
import seaborn as sns
!pip install pandas-datareader
import pandas_datareader
import datetime
import pandas_datareader.data as web
import requests

from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
!pip install TextBlob
from textblob import TextBlob

from pandas_datareader import data as pdr
!pip install yfinance
import yfinance

import scipy.stats as stats
import nltk

from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk.corpus import stopwords
nltk.download('stopwords')

!pip install wordcloud
from wordcloud import WordCloud
import re

#Use VADER
!pip install vadersentiment
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyser = SentimentIntensityAnalyzer()

from urllib.request import urlopen
import requests
from bs4 import BeautifulSoup
import os

!pip install feedparser
import feedparser as fp
import json

!pip install newspaper3k
import newspaper
from newspaper import Article
from time import mktime

from datetime import datetime, timedelta
import time
import pprint

%matplotlib inline

import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2018)
import nltk
nltk.download('wordnet')

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

from wordcloud import WordCloud, STOPWORDS,ImageColorGenerator
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer, PorterStemmer
from textblob import TextBlob
from PIL import Image
from os import path

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from urllib.request import urlopen
import requests
from bs4 import BeautifulSoup
import os

!pip install feedparser
import feedparser as fp
import json

!pip install newspaper3k
import newspaper
from newspaper import Article
from time import mktime

from datetime import datetime, timedelta
import time
import pprint

import nltk
nltk.download('vader_lexicon')

import json
import requests
from os import makedirs
from os.path import join, exists
from datetime import date, timedelta

import glob

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import Normalizer
Requirement already satisfied: pandas in c:\programdata\anaconda3\lib\site-packages (1.0.3)
Requirement already satisfied: pytz>=2017.2 in c:\programdata\anaconda3\lib\site-packages (from pandas) (2020.1)
Requirement already satisfied: python-dateutil>=2.6.1 in c:\programdata\anaconda3\lib\site-packages (from pandas) (2.8.1)
Requirement already satisfied: numpy>=1.13.3 in c:\programdata\anaconda3\lib\site-packages (from pandas) (1.18.3)
Requirement already satisfied: six>=1.5 in c:\programdata\anaconda3\lib\site-packages (from python-dateutil>=2.6.1->pandas) (1.14.0)
Requirement already satisfied: pandas-datareader in c:\programdata\anaconda3\lib\site-packages (0.8.1)
Requirement already satisfied: requests>=2.3.0 in c:\programdata\anaconda3\lib\site-packages (from pandas-datareader) (2.23.0)
Requirement already satisfied: pandas>=0.21 in c:\programdata\anaconda3\lib\site-packages (from pandas-datareader) (1.0.3)
Requirement already satisfied: lxml in c:\programdata\anaconda3\lib\site-packages (from pandas-datareader) (4.5.0)
Requirement already satisfied: certifi>=2017.4.17 in c:\programdata\anaconda3\lib\site-packages (from requests>=2.3.0->pandas-datareader) (2020.4.5.1)
Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in c:\programdata\anaconda3\lib\site-packages (from requests>=2.3.0->pandas-datareader) (1.25.8)
Requirement already satisfied: chardet<4,>=3.0.2 in c:\programdata\anaconda3\lib\site-packages (from requests>=2.3.0->pandas-datareader) (3.0.4)
Requirement already satisfied: idna<3,>=2.5 in c:\programdata\anaconda3\lib\site-packages (from requests>=2.3.0->pandas-datareader) (2.9)
Requirement already satisfied: python-dateutil>=2.6.1 in c:\programdata\anaconda3\lib\site-packages (from pandas>=0.21->pandas-datareader) (2.8.1)
Requirement already satisfied: numpy>=1.13.3 in c:\programdata\anaconda3\lib\site-packages (from pandas>=0.21->pandas-datareader) (1.18.3)
Requirement already satisfied: pytz>=2017.2 in c:\programdata\anaconda3\lib\site-packages (from pandas>=0.21->pandas-datareader) (2020.1)
Requirement already satisfied: six>=1.5 in c:\programdata\anaconda3\lib\site-packages (from python-dateutil>=2.6.1->pandas>=0.21->pandas-datareader) (1.14.0)
C:\ProgramData\Anaconda3\lib\site-packages\pandas_datareader\compat\__init__.py:7: FutureWarning: pandas.util.testing is deprecated. Use the functions in the public API at pandas.testing instead.
  from pandas.util.testing import assert_frame_equal
Requirement already satisfied: TextBlob in c:\programdata\anaconda3\lib\site-packages (0.15.3)
Requirement already satisfied: nltk>=3.1 in c:\programdata\anaconda3\lib\site-packages (from TextBlob) (3.5)
Requirement already satisfied: regex in c:\programdata\anaconda3\lib\site-packages (from nltk>=3.1->TextBlob) (2020.5.7)
Requirement already satisfied: click in c:\programdata\anaconda3\lib\site-packages (from nltk>=3.1->TextBlob) (7.1.2)
Requirement already satisfied: joblib in c:\programdata\anaconda3\lib\site-packages (from nltk>=3.1->TextBlob) (0.14.1)
Requirement already satisfied: tqdm in c:\programdata\anaconda3\lib\site-packages (from nltk>=3.1->TextBlob) (4.46.0)
Requirement already satisfied: yfinance in c:\programdata\anaconda3\lib\site-packages (0.1.54)
Requirement already satisfied: multitasking>=0.0.7 in c:\programdata\anaconda3\lib\site-packages (from yfinance) (0.0.9)
Requirement already satisfied: requests>=2.20 in c:\programdata\anaconda3\lib\site-packages (from yfinance) (2.23.0)
Requirement already satisfied: numpy>=1.15 in c:\programdata\anaconda3\lib\site-packages (from yfinance) (1.18.3)
Requirement already satisfied: pandas>=0.24 in c:\programdata\anaconda3\lib\site-packages (from yfinance) (1.0.3)
Requirement already satisfied: certifi>=2017.4.17 in c:\programdata\anaconda3\lib\site-packages (from requests>=2.20->yfinance) (2020.4.5.1)
Requirement already satisfied: idna<3,>=2.5 in c:\programdata\anaconda3\lib\site-packages (from requests>=2.20->yfinance) (2.9)
Requirement already satisfied: chardet<4,>=3.0.2 in c:\programdata\anaconda3\lib\site-packages (from requests>=2.20->yfinance) (3.0.4)
Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in c:\programdata\anaconda3\lib\site-packages (from requests>=2.20->yfinance) (1.25.8)
Requirement already satisfied: pytz>=2017.2 in c:\programdata\anaconda3\lib\site-packages (from pandas>=0.24->yfinance) (2020.1)
Requirement already satisfied: python-dateutil>=2.6.1 in c:\programdata\anaconda3\lib\site-packages (from pandas>=0.24->yfinance) (2.8.1)
Requirement already satisfied: six>=1.5 in c:\programdata\anaconda3\lib\site-packages (from python-dateutil>=2.6.1->pandas>=0.24->yfinance) (1.14.0)
[nltk_data] Downloading package stopwords to C:\Users\Shivam
[nltk_data]     Dixit\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Requirement already satisfied: wordcloud in c:\programdata\anaconda3\lib\site-packages (1.7.0)
Requirement already satisfied: numpy>=1.6.1 in c:\programdata\anaconda3\lib\site-packages (from wordcloud) (1.18.3)
Requirement already satisfied: pillow in c:\programdata\anaconda3\lib\site-packages (from wordcloud) (4.2.1)
Requirement already satisfied: matplotlib in c:\programdata\anaconda3\lib\site-packages (from wordcloud) (2.2.2)
Requirement already satisfied: olefile in c:\programdata\anaconda3\lib\site-packages (from pillow->wordcloud) (0.46)
Requirement already satisfied: cycler>=0.10 in c:\programdata\anaconda3\lib\site-packages (from matplotlib->wordcloud) (0.10.0)
Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in c:\programdata\anaconda3\lib\site-packages (from matplotlib->wordcloud) (2.4.7)
Requirement already satisfied: python-dateutil>=2.1 in c:\programdata\anaconda3\lib\site-packages (from matplotlib->wordcloud) (2.8.1)
Requirement already satisfied: pytz in c:\programdata\anaconda3\lib\site-packages (from matplotlib->wordcloud) (2020.1)
Requirement already satisfied: six>=1.10 in c:\programdata\anaconda3\lib\site-packages (from matplotlib->wordcloud) (1.14.0)
Requirement already satisfied: kiwisolver>=1.0.1 in c:\programdata\anaconda3\lib\site-packages (from matplotlib->wordcloud) (1.2.0)
Requirement already satisfied: vadersentiment in c:\programdata\anaconda3\lib\site-packages (3.3.1)
Requirement already satisfied: feedparser in c:\programdata\anaconda3\lib\site-packages (5.2.1)
Requirement already satisfied: newspaper3k in c:\programdata\anaconda3\lib\site-packages (0.2.8)
Requirement already satisfied: nltk>=3.2.1 in c:\programdata\anaconda3\lib\site-packages (from newspaper3k) (3.5)
Requirement already satisfied: cssselect>=0.9.2 in c:\programdata\anaconda3\lib\site-packages (from newspaper3k) (1.1.0)
Requirement already satisfied: feedparser>=5.2.1 in c:\programdata\anaconda3\lib\site-packages (from newspaper3k) (5.2.1)
Requirement already satisfied: tldextract>=2.0.1 in c:\programdata\anaconda3\lib\site-packages (from newspaper3k) (2.2.2)
Requirement already satisfied: beautifulsoup4>=4.4.1 in c:\programdata\anaconda3\lib\site-packages (from newspaper3k) (4.9.0)
Requirement already satisfied: requests>=2.10.0 in c:\programdata\anaconda3\lib\site-packages (from newspaper3k) (2.23.0)
Requirement already satisfied: lxml>=3.6.0 in c:\programdata\anaconda3\lib\site-packages (from newspaper3k) (4.5.0)
Requirement already satisfied: feedfinder2>=0.0.4 in c:\programdata\anaconda3\lib\site-packages (from newspaper3k) (0.0.4)
Requirement already satisfied: PyYAML>=3.11 in c:\programdata\anaconda3\lib\site-packages (from newspaper3k) (5.3.1)
Requirement already satisfied: tinysegmenter==0.3 in c:\programdata\anaconda3\lib\site-packages (from newspaper3k) (0.3)
Requirement already satisfied: python-dateutil>=2.5.3 in c:\programdata\anaconda3\lib\site-packages (from newspaper3k) (2.8.1)
Requirement already satisfied: Pillow>=3.3.0 in c:\programdata\anaconda3\lib\site-packages (from newspaper3k) (4.2.1)
Requirement already satisfied: jieba3k>=0.35.1 in c:\programdata\anaconda3\lib\site-packages (from newspaper3k) (0.35.1)
Requirement already satisfied: joblib in c:\programdata\anaconda3\lib\site-packages (from nltk>=3.2.1->newspaper3k) (0.14.1)
Requirement already satisfied: regex in c:\programdata\anaconda3\lib\site-packages (from nltk>=3.2.1->newspaper3k) (2020.5.7)
Requirement already satisfied: click in c:\programdata\anaconda3\lib\site-packages (from nltk>=3.2.1->newspaper3k) (7.1.2)
Requirement already satisfied: tqdm in c:\programdata\anaconda3\lib\site-packages (from nltk>=3.2.1->newspaper3k) (4.46.0)
Requirement already satisfied: idna in c:\programdata\anaconda3\lib\site-packages (from tldextract>=2.0.1->newspaper3k) (2.9)
Requirement already satisfied: setuptools in c:\programdata\anaconda3\lib\site-packages (from tldextract>=2.0.1->newspaper3k) (46.2.0.post20200511)
Requirement already satisfied: requests-file>=1.4 in c:\programdata\anaconda3\lib\site-packages (from tldextract>=2.0.1->newspaper3k) (1.5.1)
Requirement already satisfied: soupsieve>1.2 in c:\programdata\anaconda3\lib\site-packages (from beautifulsoup4>=4.4.1->newspaper3k) (2.0)
Requirement already satisfied: chardet<4,>=3.0.2 in c:\programdata\anaconda3\lib\site-packages (from requests>=2.10.0->newspaper3k) (3.0.4)
Requirement already satisfied: certifi>=2017.4.17 in c:\programdata\anaconda3\lib\site-packages (from requests>=2.10.0->newspaper3k) (2020.4.5.1)
Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in c:\programdata\anaconda3\lib\site-packages (from requests>=2.10.0->newspaper3k) (1.25.8)
Requirement already satisfied: six in c:\programdata\anaconda3\lib\site-packages (from feedfinder2>=0.0.4->newspaper3k) (1.14.0)
Requirement already satisfied: olefile in c:\programdata\anaconda3\lib\site-packages (from Pillow>=3.3.0->newspaper3k) (0.46)
[nltk_data] Downloading package wordnet to C:\Users\Shivam
[nltk_data]     Dixit\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Requirement already satisfied: feedparser in c:\programdata\anaconda3\lib\site-packages (5.2.1)
Requirement already satisfied: newspaper3k in c:\programdata\anaconda3\lib\site-packages (0.2.8)
Requirement already satisfied: lxml>=3.6.0 in c:\programdata\anaconda3\lib\site-packages (from newspaper3k) (4.5.0)
Requirement already satisfied: beautifulsoup4>=4.4.1 in c:\programdata\anaconda3\lib\site-packages (from newspaper3k) (4.9.0)
Requirement already satisfied: python-dateutil>=2.5.3 in c:\programdata\anaconda3\lib\site-packages (from newspaper3k) (2.8.1)
Requirement already satisfied: tinysegmenter==0.3 in c:\programdata\anaconda3\lib\site-packages (from newspaper3k) (0.3)
Requirement already satisfied: jieba3k>=0.35.1 in c:\programdata\anaconda3\lib\site-packages (from newspaper3k) (0.35.1)
Requirement already satisfied: feedparser>=5.2.1 in c:\programdata\anaconda3\lib\site-packages (from newspaper3k) (5.2.1)
Requirement already satisfied: tldextract>=2.0.1 in c:\programdata\anaconda3\lib\site-packages (from newspaper3k) (2.2.2)
Requirement already satisfied: requests>=2.10.0 in c:\programdata\anaconda3\lib\site-packages (from newspaper3k) (2.23.0)
Requirement already satisfied: PyYAML>=3.11 in c:\programdata\anaconda3\lib\site-packages (from newspaper3k) (5.3.1)
Requirement already satisfied: cssselect>=0.9.2 in c:\programdata\anaconda3\lib\site-packages (from newspaper3k) (1.1.0)
Requirement already satisfied: Pillow>=3.3.0 in c:\programdata\anaconda3\lib\site-packages (from newspaper3k) (4.2.1)
Requirement already satisfied: nltk>=3.2.1 in c:\programdata\anaconda3\lib\site-packages (from newspaper3k) (3.5)
Requirement already satisfied: feedfinder2>=0.0.4 in c:\programdata\anaconda3\lib\site-packages (from newspaper3k) (0.0.4)
Requirement already satisfied: soupsieve>1.2 in c:\programdata\anaconda3\lib\site-packages (from beautifulsoup4>=4.4.1->newspaper3k) (2.0)
Requirement already satisfied: six>=1.5 in c:\programdata\anaconda3\lib\site-packages (from python-dateutil>=2.5.3->newspaper3k) (1.14.0)
Requirement already satisfied: requests-file>=1.4 in c:\programdata\anaconda3\lib\site-packages (from tldextract>=2.0.1->newspaper3k) (1.5.1)
Requirement already satisfied: setuptools in c:\programdata\anaconda3\lib\site-packages (from tldextract>=2.0.1->newspaper3k) (46.2.0.post20200511)
Requirement already satisfied: idna in c:\programdata\anaconda3\lib\site-packages (from tldextract>=2.0.1->newspaper3k) (2.9)
Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in c:\programdata\anaconda3\lib\site-packages (from requests>=2.10.0->newspaper3k) (1.25.8)
Requirement already satisfied: certifi>=2017.4.17 in c:\programdata\anaconda3\lib\site-packages (from requests>=2.10.0->newspaper3k) (2020.4.5.1)
Requirement already satisfied: chardet<4,>=3.0.2 in c:\programdata\anaconda3\lib\site-packages (from requests>=2.10.0->newspaper3k) (3.0.4)
Requirement already satisfied: olefile in c:\programdata\anaconda3\lib\site-packages (from Pillow>=3.3.0->newspaper3k) (0.46)
Requirement already satisfied: click in c:\programdata\anaconda3\lib\site-packages (from nltk>=3.2.1->newspaper3k) (7.1.2)
Requirement already satisfied: tqdm in c:\programdata\anaconda3\lib\site-packages (from nltk>=3.2.1->newspaper3k) (4.46.0)
Requirement already satisfied: joblib in c:\programdata\anaconda3\lib\site-packages (from nltk>=3.2.1->newspaper3k) (0.14.1)
Requirement already satisfied: regex in c:\programdata\anaconda3\lib\site-packages (from nltk>=3.2.1->newspaper3k) (2020.5.7)
[nltk_data] Downloading package vader_lexicon to C:\Users\Shivam
[nltk_data]     Dixit\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
In [2]:
# Canonical day and month orderings, used later to sort categorical axes
# (pandas/seaborn would otherwise plot them alphabetically).
order_days = 'Monday Tuesday Wednesday Thursday Friday Saturday Sunday'.split()
order_month = ('January February March April May June July '
               'August September October November December').split()

Loading Data

Stock Indices Data

FTSE 100

In [3]:
# Load the FTSE 100 daily price history from the local CSV export
# (source: investing.com).  NOTE(review): every column is parsed as a
# string (object dtype) because Price/Open/High/Low contain thousands
# separators and Volume uses suffixes like "M" — numeric cleaning is
# needed before any arithmetic on these columns.
ftse = pd.read_csv('FTSE 100 Historical Data.csv')
ftse.head()
Out[3]:
Date Price Open High Low Volume Chg%
0 Dec 31, 2019 7,542.44 7,587.05 7,587.41 7,532.38 212.14M -0.59%
1 Dec 30, 2019 7,587.05 7,644.90 7,644.90 7,587.05 304.24M -0.76%
2 Dec 27, 2019 7,644.90 7,632.24 7,665.40 7,622.46 311.11M 0.17%
3 Dec 24, 2019 7,632.24 7,623.59 7,636.16 7,608.22 185.67M 0.11%
4 Dec 23, 2019 7,623.59 7,582.48 7,628.20 7,559.63 539.14M 0.54%
In [4]:
# Inspect dtypes and null counts — confirms all 7 columns were read as
# object (string), so they still require type conversion.
ftse.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1010 entries, 0 to 1009
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Date    1010 non-null   object
 1   Price   1010 non-null   object
 2   Open    1010 non-null   object
 3   High    1010 non-null   object
 4   Low     1010 non-null   object
 5   Volume  1010 non-null   object
 6   Chg%    1010 non-null   object
dtypes: object(7)
memory usage: 55.4+ KB
In [5]:
# Summary statistics.  Because every column is object-typed, this reports
# count/unique/top/freq rather than numeric statistics (mean, std, ...).
ftse.describe()
Out[5]:
Date Price Open High Low Volume Chg%
count 1010 1010 1010 1010 1010 1010 1010
unique 1010 1006 1006 1008 1001 920 313
top Jun 29, 2016 7,328.92 7,328.92 7,180.71 6,036.70 1.01B -0.15%
freq 1 2 2 2 2 10 11
In [6]:
#use yfinance and pandas data reader to import the S&P500 index values 
# Download daily S&P 500 (^GSPC) prices from Yahoo Finance via
# pandas-datareader.  Use unambiguous ISO-8601 date strings: the original
# "2016, 01, 1" form only worked through dateutil's lenient parsing.
SP500 = pdr.get_data_yahoo('^GSPC',
                           start='2016-01-01',
                           end='2019-12-31')
# Promote the Date index to an ordinary column so the frame's layout
# matches the CSV-based datasets loaded above.
SP500.reset_index(inplace=True)
SP500.head()
Out[6]:
Date High Low Open Close Volume Adj Close
0 2015-12-31 2062.540039 2043.619995 2060.590088 2043.939941 2655330000 2043.939941
1 2016-01-04 2038.199951 1989.680054 2038.199951 2012.660034 4304880000 2012.660034
2 2016-01-05 2021.939941 2004.170044 2013.780029 2016.709961 3706620000 2016.709961
3 2016-01-06 2011.709961 1979.050049 2011.709961 1990.260010 4336660000 1990.260010
4 2016-01-07 1985.319946 1938.829956 1985.319946 1943.089966 5076590000 1943.089966
In [7]:
# Numeric summary of the downloaded S&P 500 data (High/Low/Open/Close/
# Volume/Adj Close are proper floats here, unlike the CSV imports).
SP500.describe()
Out[7]:
High Low Open Close Volume Adj Close
count 1007.000000 1007.000000 1007.000000 1007.000000 1.007000e+03 1007.000000
mean 2560.390457 2538.246376 2549.895392 2550.228200 3.616929e+09 2550.228200
std 334.291490 332.708884 333.440880 333.353202 6.783592e+08 333.353202
min 1847.000000 1810.099976 1833.400024 1829.079956 1.296540e+09 1829.079956
25% 2271.895020 2258.625000 2267.295044 2265.189941 3.230565e+09 2265.189941
50% 2627.689941 2585.889893 2602.659912 2602.419922 3.511530e+09 2602.419922
75% 2829.949951 2803.724976 2818.545044 2817.330078 3.871780e+09 2817.330078
max 3247.929932 3234.370117 3247.229980 3240.020020 7.609010e+09 3240.020020
In [8]:
# Import the USD/GBP exchange-rate history (source: investing.com).
# NOTE(review): the original comment said AUD/USD, but the file is USD_GBP.
# As with the FTSE CSV, columns may need numeric cleaning before use.
FXUSD = pd.read_csv('USD_GBP Historical Data.csv')
FXUSD.head()
Out[8]:
Date Price Open High Low Volume Chg%
0 Dec 31, 2019 1.3261 1.3117 1.3285 1.3106 97.86K 1.11%
1 Dec 30, 2019 1.3115 1.3067 1.3152 1.3066 100.82K 0.29%
2 Dec 27, 2019 1.3077 1.2993 1.3118 1.2967 85.56K 0.65%
3 Dec 26, 2019 1.2993 1.2961 1.3016 1.2951 99.65K 0.25%
4 Dec 25, 2019 1.2961 1.2942 1.2989 1.2918 34.43K 0.12%
In [9]:
# Summary statistics for the exchange-rate series (Price/Open/High/Low
# parsed as numeric here; Volume and Chg% are excluded from the output).
FXUSD.describe()
Out[9]:
Price Open High Low
count 1042.000000 1042.000000 1042.000000 1042.000000
mean 1.314114 1.314188 1.319984 1.308206
std 0.063706 0.063918 0.064487 0.063344
min 1.203700 1.204400 1.209000 1.145200
25% 1.271900 1.272300 1.275925 1.266300
50% 1.303350 1.303200 1.308550 1.298500
75% 1.338425 1.338600 1.344475 1.333100
max 1.487900 1.487200 1.502000 1.473900

Comments
The FTSE 100 data and the USD/GBP exchange-rate data were obtained from investing.com[9][11], and the S&P 500 data were retrieved using the yfinance and pandas-datareader packages[12][13].

News Data

In [10]:
# Never hardcode API credentials in a notebook — the saved .ipynb (and its
# rendered outputs) leaks them to anyone the file is shared with.  Read the
# Guardian API key from the environment instead: set GUARDIAN_API_KEY
# before launching Jupyter.  (The previously committed key should be revoked.)
guardian_api = os.environ.get('GUARDIAN_API_KEY', '')
if not guardian_api:
    print('Warning: GUARDIAN_API_KEY is not set; Guardian API requests will fail.')
In [243]:
# Local cache directory for the downloaded news articles (one JSON per day).
ARTICLES_DIR = join('tempdata', 'newsarticles')
makedirs(ARTICLES_DIR, exist_ok=True)

# Guardian Content API search endpoint.  Use HTTPS so the API key (sent as
# a query-string parameter) is not transmitted in cleartext.
# Docs: https://open-platform.theguardian.com/documentation/
#
# Sample request:
#   https://content.guardianapis.com/search?q=news&api-key=your-api-key
API_ENDPOINT = 'https://content.guardianapis.com/search?q=news'
my_params = {
    'from-date': "2016-01-01",
    'to-date': "2019-12-31",
    'order-by': "newest",
    'show-fields': 'all',   # include full article fields (body, etc.) in the response
    'page-size': 200,       # maximum page size allowed by the API
    'api-key': guardian_api
}
In [244]:
# Sanity-check the request parameters.  CAUTION: this echoes the API key
# into the notebook output — clear outputs before committing or sharing.
my_params
Out[244]:
{'from-date': '2016-01-01',
 'to-date': '2019-12-31',
 'order-by': 'newest',
 'show-fields': 'all',
 'page-size': 200,
 'api-key': '8ca75031-03ce-4640-80b3-9accc793d8e6'}
In [13]:
# Download one JSON file of Guardian articles per day, skipping any day
# already cached on disk — the API allows only 5,000 requests, so the range
# is fetched incrementally across sessions.
start_date = date(2016, 1, 1)
end_date = date(2019, 12, 31)
dayrange = range((end_date - start_date).days + 1)
for daycount in dayrange:
    dt = start_date + timedelta(days=daycount)
    datestr = dt.strftime('%Y-%m-%d')
    fname = join(ARTICLES_DIR, datestr + '.json')
    if not exists(fname):
        # Not cached yet — fetch every result page for this single day.
        print("Downloading", datestr)
        all_results = []
        my_params['from-date'] = datestr
        my_params['to-date'] = datestr
        current_page = 1
        total_pages = 1
        while current_page <= total_pages:
            print("...page", current_page)
            my_params['page'] = current_page
            # timeout prevents a hung connection from stalling the whole run;
            # raise_for_status fails loudly on HTTP errors instead of crashing
            # later on malformed JSON.
            resp = requests.get(API_ENDPOINT, params=my_params, timeout=30)
            resp.raise_for_status()
            data = resp.json()
            all_results.extend(data['response']['results'])
            current_page += 1
            # The first response tells us how many pages this day actually has.
            total_pages = data['response']['pages']

        # Persist the day's articles as pretty-printed JSON under
        # tempdata/newsarticles/<YYYY-MM-DD>.json.
        with open(fname, 'w') as f:
            print("Writing to", fname)
            json.dump(all_results, f, indent=2)

Read in the JSON files

In [14]:
#Find names of all files in the folder tempdata/newsarticles/ with the extension .json
pattern = 'tempdata/newsarticles/*.json'

# glob.glob returns files in arbitrary, OS-dependent order; sort so the
# per-day files are always processed chronologically (names are ISO dates).
json_files = sorted(glob.glob(pattern))

print(json_files)
['tempdata/newsarticles\\2016-01-01.json', 'tempdata/newsarticles\\2016-01-02.json', 'tempdata/newsarticles\\2016-01-03.json', 'tempdata/newsarticles\\2016-01-04.json', 'tempdata/newsarticles\\2016-01-05.json', 'tempdata/newsarticles\\2016-01-06.json', 'tempdata/newsarticles\\2016-01-07.json', 'tempdata/newsarticles\\2016-01-08.json', 'tempdata/newsarticles\\2016-01-09.json', 'tempdata/newsarticles\\2016-01-10.json', 'tempdata/newsarticles\\2016-01-11.json', 'tempdata/newsarticles\\2016-01-12.json', 'tempdata/newsarticles\\2016-01-13.json', 'tempdata/newsarticles\\2016-01-14.json', 'tempdata/newsarticles\\2016-01-15.json', 'tempdata/newsarticles\\2016-01-16.json', 'tempdata/newsarticles\\2016-01-17.json', 'tempdata/newsarticles\\2016-01-18.json', 'tempdata/newsarticles\\2016-01-19.json', 'tempdata/newsarticles\\2016-01-20.json', 'tempdata/newsarticles\\2016-01-21.json', 'tempdata/newsarticles\\2016-01-22.json', 'tempdata/newsarticles\\2016-01-23.json', 'tempdata/newsarticles\\2016-01-24.json', 'tempdata/newsarticles\\2016-01-25.json', 'tempdata/newsarticles\\2016-01-26.json', 'tempdata/newsarticles\\2016-01-27.json', 'tempdata/newsarticles\\2016-01-28.json', 'tempdata/newsarticles\\2016-01-29.json', 'tempdata/newsarticles\\2016-01-30.json', 'tempdata/newsarticles\\2016-01-31.json', 'tempdata/newsarticles\\2016-02-01.json', 'tempdata/newsarticles\\2016-02-02.json', 'tempdata/newsarticles\\2016-02-03.json', 'tempdata/newsarticles\\2016-02-04.json', 'tempdata/newsarticles\\2016-02-05.json', 'tempdata/newsarticles\\2016-02-06.json', 'tempdata/newsarticles\\2016-02-07.json', 'tempdata/newsarticles\\2016-02-08.json', 'tempdata/newsarticles\\2016-02-09.json', 'tempdata/newsarticles\\2016-02-10.json', 'tempdata/newsarticles\\2016-02-11.json', 'tempdata/newsarticles\\2016-02-12.json', 'tempdata/newsarticles\\2016-02-13.json', 'tempdata/newsarticles\\2016-02-14.json', 'tempdata/newsarticles\\2016-02-15.json', 'tempdata/newsarticles\\2016-02-16.json', 
'tempdata/newsarticles\\2016-02-17.json', 'tempdata/newsarticles\\2016-02-18.json', 'tempdata/newsarticles\\2016-02-19.json', 'tempdata/newsarticles\\2016-02-20.json', 'tempdata/newsarticles\\2016-02-21.json', 'tempdata/newsarticles\\2016-02-22.json', 'tempdata/newsarticles\\2016-02-23.json', 'tempdata/newsarticles\\2016-02-24.json', 'tempdata/newsarticles\\2016-02-25.json', 'tempdata/newsarticles\\2016-02-26.json', 'tempdata/newsarticles\\2016-02-27.json', 'tempdata/newsarticles\\2016-02-28.json', 'tempdata/newsarticles\\2016-02-29.json', 'tempdata/newsarticles\\2016-03-01.json', 'tempdata/newsarticles\\2016-03-02.json', 'tempdata/newsarticles\\2016-03-03.json', 'tempdata/newsarticles\\2016-03-04.json', 'tempdata/newsarticles\\2016-03-05.json', 'tempdata/newsarticles\\2016-03-06.json', 'tempdata/newsarticles\\2016-03-07.json', 'tempdata/newsarticles\\2016-03-08.json', 'tempdata/newsarticles\\2016-03-09.json', 'tempdata/newsarticles\\2016-03-10.json', 'tempdata/newsarticles\\2016-03-11.json', 'tempdata/newsarticles\\2016-03-12.json', 'tempdata/newsarticles\\2016-03-13.json', 'tempdata/newsarticles\\2016-03-14.json', 'tempdata/newsarticles\\2016-03-15.json', 'tempdata/newsarticles\\2016-03-16.json', 'tempdata/newsarticles\\2016-03-17.json', 'tempdata/newsarticles\\2016-03-18.json', 'tempdata/newsarticles\\2016-03-19.json', 'tempdata/newsarticles\\2016-03-20.json', 'tempdata/newsarticles\\2016-03-21.json', 'tempdata/newsarticles\\2016-03-22.json', 'tempdata/newsarticles\\2016-03-23.json', 'tempdata/newsarticles\\2016-03-24.json', 'tempdata/newsarticles\\2016-03-25.json', 'tempdata/newsarticles\\2016-03-26.json', 'tempdata/newsarticles\\2016-03-27.json', 'tempdata/newsarticles\\2016-03-28.json', 'tempdata/newsarticles\\2016-03-29.json', 'tempdata/newsarticles\\2016-03-30.json', 'tempdata/newsarticles\\2016-03-31.json', 'tempdata/newsarticles\\2016-04-01.json', 'tempdata/newsarticles\\2016-04-02.json', 'tempdata/newsarticles\\2016-04-03.json', 
'tempdata/newsarticles\\2016-04-04.json', 'tempdata/newsarticles\\2016-04-05.json', 'tempdata/newsarticles\\2016-04-06.json', 'tempdata/newsarticles\\2016-04-07.json', 'tempdata/newsarticles\\2016-04-08.json', 'tempdata/newsarticles\\2016-04-09.json', 'tempdata/newsarticles\\2016-04-10.json', 'tempdata/newsarticles\\2016-04-11.json', 'tempdata/newsarticles\\2016-04-12.json', 'tempdata/newsarticles\\2016-04-13.json', 'tempdata/newsarticles\\2016-04-14.json', 'tempdata/newsarticles\\2016-04-15.json', 'tempdata/newsarticles\\2016-04-16.json', 'tempdata/newsarticles\\2016-04-17.json', 'tempdata/newsarticles\\2016-04-18.json', 'tempdata/newsarticles\\2016-04-19.json', 'tempdata/newsarticles\\2016-04-20.json', 'tempdata/newsarticles\\2016-04-21.json', 'tempdata/newsarticles\\2016-04-22.json', 'tempdata/newsarticles\\2016-04-23.json', 'tempdata/newsarticles\\2016-04-24.json', 'tempdata/newsarticles\\2016-04-25.json', 'tempdata/newsarticles\\2016-04-26.json', 'tempdata/newsarticles\\2016-04-27.json', 'tempdata/newsarticles\\2016-04-28.json', 'tempdata/newsarticles\\2016-04-29.json', 'tempdata/newsarticles\\2016-04-30.json', 'tempdata/newsarticles\\2016-05-01.json', 'tempdata/newsarticles\\2016-05-02.json', 'tempdata/newsarticles\\2016-05-03.json', 'tempdata/newsarticles\\2016-05-04.json', 'tempdata/newsarticles\\2016-05-05.json', 'tempdata/newsarticles\\2016-05-06.json', 'tempdata/newsarticles\\2016-05-07.json', 'tempdata/newsarticles\\2016-05-08.json', 'tempdata/newsarticles\\2016-05-09.json', 'tempdata/newsarticles\\2016-05-10.json', 'tempdata/newsarticles\\2016-05-11.json', 'tempdata/newsarticles\\2016-05-12.json', 'tempdata/newsarticles\\2016-05-13.json', 'tempdata/newsarticles\\2016-05-14.json', 'tempdata/newsarticles\\2016-05-15.json', 'tempdata/newsarticles\\2016-05-16.json', 'tempdata/newsarticles\\2016-05-17.json', 'tempdata/newsarticles\\2016-05-18.json', 'tempdata/newsarticles\\2016-05-19.json', 'tempdata/newsarticles\\2016-05-20.json', 
'tempdata/newsarticles\\2016-05-21.json', 'tempdata/newsarticles\\2016-05-22.json', 'tempdata/newsarticles\\2016-05-23.json', 'tempdata/newsarticles\\2016-05-24.json', 'tempdata/newsarticles\\2016-05-25.json', 'tempdata/newsarticles\\2016-05-26.json', 'tempdata/newsarticles\\2016-05-27.json', 'tempdata/newsarticles\\2016-05-28.json', 'tempdata/newsarticles\\2016-05-29.json', 'tempdata/newsarticles\\2016-05-30.json', 'tempdata/newsarticles\\2016-05-31.json', 'tempdata/newsarticles\\2016-06-01.json', 'tempdata/newsarticles\\2016-06-02.json', 'tempdata/newsarticles\\2016-06-03.json', 'tempdata/newsarticles\\2016-06-04.json', 'tempdata/newsarticles\\2016-06-05.json', 'tempdata/newsarticles\\2016-06-06.json', 'tempdata/newsarticles\\2016-06-07.json', 'tempdata/newsarticles\\2016-06-08.json', 'tempdata/newsarticles\\2016-06-09.json', 'tempdata/newsarticles\\2016-06-10.json', 'tempdata/newsarticles\\2016-06-11.json', 'tempdata/newsarticles\\2016-06-12.json', 'tempdata/newsarticles\\2016-06-13.json', 'tempdata/newsarticles\\2016-06-14.json', 'tempdata/newsarticles\\2016-06-15.json', 'tempdata/newsarticles\\2016-06-16.json', 'tempdata/newsarticles\\2016-06-17.json', 'tempdata/newsarticles\\2016-06-18.json', 'tempdata/newsarticles\\2016-06-19.json', 'tempdata/newsarticles\\2016-06-20.json', 'tempdata/newsarticles\\2016-06-21.json', 'tempdata/newsarticles\\2016-06-22.json', 'tempdata/newsarticles\\2016-06-23.json', 'tempdata/newsarticles\\2016-06-24.json', 'tempdata/newsarticles\\2016-06-25.json', 'tempdata/newsarticles\\2016-06-26.json', 'tempdata/newsarticles\\2016-06-27.json', 'tempdata/newsarticles\\2016-06-28.json', 'tempdata/newsarticles\\2016-06-29.json', 'tempdata/newsarticles\\2016-06-30.json', 'tempdata/newsarticles\\2016-07-01.json', 'tempdata/newsarticles\\2016-07-02.json', 'tempdata/newsarticles\\2016-07-03.json', 'tempdata/newsarticles\\2016-07-04.json', 'tempdata/newsarticles\\2016-07-05.json', 'tempdata/newsarticles\\2016-07-06.json', 
'tempdata/newsarticles\\2016-07-07.json', 'tempdata/newsarticles\\2016-07-08.json', 'tempdata/newsarticles\\2016-07-09.json', 'tempdata/newsarticles\\2016-07-10.json', 'tempdata/newsarticles\\2016-07-11.json', 'tempdata/newsarticles\\2016-07-12.json', 'tempdata/newsarticles\\2016-07-13.json', 'tempdata/newsarticles\\2016-07-14.json', 'tempdata/newsarticles\\2016-07-15.json', 'tempdata/newsarticles\\2016-07-16.json', 'tempdata/newsarticles\\2016-07-17.json', 'tempdata/newsarticles\\2016-07-18.json', 'tempdata/newsarticles\\2016-07-19.json', 'tempdata/newsarticles\\2016-07-20.json', 'tempdata/newsarticles\\2016-07-21.json', 'tempdata/newsarticles\\2016-07-22.json', 'tempdata/newsarticles\\2016-07-23.json', 'tempdata/newsarticles\\2016-07-24.json', 'tempdata/newsarticles\\2016-07-25.json', 'tempdata/newsarticles\\2016-07-26.json', 'tempdata/newsarticles\\2016-07-27.json', 'tempdata/newsarticles\\2016-07-28.json', 'tempdata/newsarticles\\2016-07-29.json', 'tempdata/newsarticles\\2016-07-30.json', 'tempdata/newsarticles\\2016-07-31.json', 'tempdata/newsarticles\\2016-08-01.json', 'tempdata/newsarticles\\2016-08-02.json', 'tempdata/newsarticles\\2016-08-03.json', 'tempdata/newsarticles\\2016-08-04.json', 'tempdata/newsarticles\\2016-08-05.json', 'tempdata/newsarticles\\2016-08-06.json', 'tempdata/newsarticles\\2016-08-07.json', 'tempdata/newsarticles\\2016-08-08.json', 'tempdata/newsarticles\\2016-08-09.json', 'tempdata/newsarticles\\2016-08-10.json', 'tempdata/newsarticles\\2016-08-11.json', 'tempdata/newsarticles\\2016-08-12.json', 'tempdata/newsarticles\\2016-08-13.json', 'tempdata/newsarticles\\2016-08-14.json', 'tempdata/newsarticles\\2016-08-15.json', 'tempdata/newsarticles\\2016-08-16.json', 'tempdata/newsarticles\\2016-08-17.json', 'tempdata/newsarticles\\2016-08-18.json', 'tempdata/newsarticles\\2016-08-19.json', 'tempdata/newsarticles\\2016-08-20.json', 'tempdata/newsarticles\\2016-08-21.json', 'tempdata/newsarticles\\2016-08-22.json', 
'tempdata/newsarticles\\2016-08-23.json', 'tempdata/newsarticles\\2016-08-24.json', 'tempdata/newsarticles\\2016-08-25.json', 'tempdata/newsarticles\\2016-08-26.json', 'tempdata/newsarticles\\2016-08-27.json', 'tempdata/newsarticles\\2016-08-28.json', 'tempdata/newsarticles\\2016-08-29.json', 'tempdata/newsarticles\\2016-08-30.json', 'tempdata/newsarticles\\2016-08-31.json', 'tempdata/newsarticles\\2016-09-01.json', 'tempdata/newsarticles\\2016-09-02.json', 'tempdata/newsarticles\\2016-09-03.json', 'tempdata/newsarticles\\2016-09-04.json', 'tempdata/newsarticles\\2016-09-05.json', 'tempdata/newsarticles\\2016-09-06.json', 'tempdata/newsarticles\\2016-09-07.json', 'tempdata/newsarticles\\2016-09-08.json', 'tempdata/newsarticles\\2016-09-09.json', 'tempdata/newsarticles\\2016-09-10.json', 'tempdata/newsarticles\\2016-09-11.json', 'tempdata/newsarticles\\2016-09-12.json', 'tempdata/newsarticles\\2016-09-13.json', 'tempdata/newsarticles\\2016-09-14.json', 'tempdata/newsarticles\\2016-09-15.json', 'tempdata/newsarticles\\2016-09-16.json', 'tempdata/newsarticles\\2016-09-17.json', 'tempdata/newsarticles\\2016-09-18.json', 'tempdata/newsarticles\\2016-09-19.json', 'tempdata/newsarticles\\2016-09-20.json', 'tempdata/newsarticles\\2016-09-21.json', 'tempdata/newsarticles\\2016-09-22.json', 'tempdata/newsarticles\\2016-09-23.json', 'tempdata/newsarticles\\2016-09-24.json', 'tempdata/newsarticles\\2016-09-25.json', 'tempdata/newsarticles\\2016-09-26.json', 'tempdata/newsarticles\\2016-09-27.json', 'tempdata/newsarticles\\2016-09-28.json', 'tempdata/newsarticles\\2016-09-29.json', 'tempdata/newsarticles\\2016-09-30.json', 'tempdata/newsarticles\\2016-10-01.json', 'tempdata/newsarticles\\2016-10-02.json', 'tempdata/newsarticles\\2016-10-03.json', 'tempdata/newsarticles\\2016-10-04.json', 'tempdata/newsarticles\\2016-10-05.json', 'tempdata/newsarticles\\2016-10-06.json', 'tempdata/newsarticles\\2016-10-07.json', 'tempdata/newsarticles\\2016-10-08.json', 
'tempdata/newsarticles\\2016-10-09.json', 'tempdata/newsarticles\\2016-10-10.json', 'tempdata/newsarticles\\2016-10-11.json', 'tempdata/newsarticles\\2016-10-12.json', 'tempdata/newsarticles\\2016-10-13.json', 'tempdata/newsarticles\\2016-10-14.json', 'tempdata/newsarticles\\2016-10-15.json', 'tempdata/newsarticles\\2016-10-16.json', 'tempdata/newsarticles\\2016-10-17.json', 'tempdata/newsarticles\\2016-10-18.json', 'tempdata/newsarticles\\2016-10-19.json', 'tempdata/newsarticles\\2016-10-20.json', 'tempdata/newsarticles\\2016-10-21.json', 'tempdata/newsarticles\\2016-10-22.json', 'tempdata/newsarticles\\2016-10-23.json', 'tempdata/newsarticles\\2016-10-24.json', 'tempdata/newsarticles\\2016-10-25.json', 'tempdata/newsarticles\\2016-10-26.json', 'tempdata/newsarticles\\2016-10-27.json', 'tempdata/newsarticles\\2016-10-28.json', 'tempdata/newsarticles\\2016-10-29.json', 'tempdata/newsarticles\\2016-10-30.json', 'tempdata/newsarticles\\2016-10-31.json', 'tempdata/newsarticles\\2016-11-01.json', 'tempdata/newsarticles\\2016-11-02.json', 'tempdata/newsarticles\\2016-11-03.json', 'tempdata/newsarticles\\2016-11-04.json', 'tempdata/newsarticles\\2016-11-05.json', 'tempdata/newsarticles\\2016-11-06.json', 'tempdata/newsarticles\\2016-11-07.json', 'tempdata/newsarticles\\2016-11-08.json', 'tempdata/newsarticles\\2016-11-09.json', 'tempdata/newsarticles\\2016-11-10.json', 'tempdata/newsarticles\\2016-11-11.json', 'tempdata/newsarticles\\2016-11-12.json', 'tempdata/newsarticles\\2016-11-13.json', 'tempdata/newsarticles\\2016-11-14.json', 'tempdata/newsarticles\\2016-11-15.json', 'tempdata/newsarticles\\2016-11-16.json', 'tempdata/newsarticles\\2016-11-17.json', 'tempdata/newsarticles\\2016-11-18.json', 'tempdata/newsarticles\\2016-11-19.json', 'tempdata/newsarticles\\2016-11-20.json', 'tempdata/newsarticles\\2016-11-21.json', 'tempdata/newsarticles\\2016-11-22.json', 'tempdata/newsarticles\\2016-11-23.json', 'tempdata/newsarticles\\2016-11-24.json', 
'tempdata/newsarticles\\2016-11-25.json', 'tempdata/newsarticles\\2016-11-26.json', 'tempdata/newsarticles\\2016-11-27.json', 'tempdata/newsarticles\\2016-11-28.json', 'tempdata/newsarticles\\2016-11-29.json', 'tempdata/newsarticles\\2016-11-30.json', 'tempdata/newsarticles\\2016-12-01.json', 'tempdata/newsarticles\\2016-12-02.json', 'tempdata/newsarticles\\2016-12-03.json', 'tempdata/newsarticles\\2016-12-04.json', 'tempdata/newsarticles\\2016-12-05.json', 'tempdata/newsarticles\\2016-12-06.json', 'tempdata/newsarticles\\2016-12-07.json', 'tempdata/newsarticles\\2016-12-08.json', 'tempdata/newsarticles\\2016-12-09.json', 'tempdata/newsarticles\\2016-12-10.json', 'tempdata/newsarticles\\2016-12-11.json', 'tempdata/newsarticles\\2016-12-12.json', 'tempdata/newsarticles\\2016-12-13.json', 'tempdata/newsarticles\\2016-12-14.json', 'tempdata/newsarticles\\2016-12-15.json', 'tempdata/newsarticles\\2016-12-16.json', 'tempdata/newsarticles\\2016-12-17.json', 'tempdata/newsarticles\\2016-12-18.json', 'tempdata/newsarticles\\2016-12-19.json', 'tempdata/newsarticles\\2016-12-20.json', 'tempdata/newsarticles\\2016-12-21.json', 'tempdata/newsarticles\\2016-12-22.json', 'tempdata/newsarticles\\2016-12-23.json', 'tempdata/newsarticles\\2016-12-24.json', 'tempdata/newsarticles\\2016-12-25.json', 'tempdata/newsarticles\\2016-12-26.json', 'tempdata/newsarticles\\2016-12-27.json', 'tempdata/newsarticles\\2016-12-28.json', 'tempdata/newsarticles\\2016-12-29.json', 'tempdata/newsarticles\\2016-12-30.json', 'tempdata/newsarticles\\2016-12-31.json', 'tempdata/newsarticles\\2017-01-01.json', 'tempdata/newsarticles\\2017-01-02.json', 'tempdata/newsarticles\\2017-01-03.json', 'tempdata/newsarticles\\2017-01-04.json', 'tempdata/newsarticles\\2017-01-05.json', 'tempdata/newsarticles\\2017-01-06.json', 'tempdata/newsarticles\\2017-01-07.json', 'tempdata/newsarticles\\2017-01-08.json', 'tempdata/newsarticles\\2017-01-09.json', 'tempdata/newsarticles\\2017-01-10.json', 
'tempdata/newsarticles\\2017-01-11.json', 'tempdata/newsarticles\\2017-01-12.json', 'tempdata/newsarticles\\2017-01-13.json', 'tempdata/newsarticles\\2017-01-14.json', 'tempdata/newsarticles\\2017-01-15.json', 'tempdata/newsarticles\\2017-01-16.json', 'tempdata/newsarticles\\2017-01-17.json', 'tempdata/newsarticles\\2017-01-18.json', 'tempdata/newsarticles\\2017-01-19.json', 'tempdata/newsarticles\\2017-01-20.json', 'tempdata/newsarticles\\2017-01-21.json', 'tempdata/newsarticles\\2017-01-22.json', 'tempdata/newsarticles\\2017-01-23.json', 'tempdata/newsarticles\\2017-01-24.json', 'tempdata/newsarticles\\2017-01-25.json', 'tempdata/newsarticles\\2017-01-26.json', 'tempdata/newsarticles\\2017-01-27.json', 'tempdata/newsarticles\\2017-01-28.json', 'tempdata/newsarticles\\2017-01-29.json', 'tempdata/newsarticles\\2017-01-30.json', 'tempdata/newsarticles\\2017-01-31.json', 'tempdata/newsarticles\\2017-02-01.json', 'tempdata/newsarticles\\2017-02-02.json', 'tempdata/newsarticles\\2017-02-03.json', 'tempdata/newsarticles\\2017-02-04.json', 'tempdata/newsarticles\\2017-02-05.json', 'tempdata/newsarticles\\2017-02-06.json', 'tempdata/newsarticles\\2017-02-07.json', 'tempdata/newsarticles\\2017-02-08.json', 'tempdata/newsarticles\\2017-02-09.json', 'tempdata/newsarticles\\2017-02-10.json', 'tempdata/newsarticles\\2017-02-11.json', 'tempdata/newsarticles\\2017-02-12.json', 'tempdata/newsarticles\\2017-02-13.json', 'tempdata/newsarticles\\2017-02-14.json', 'tempdata/newsarticles\\2017-02-15.json', 'tempdata/newsarticles\\2017-02-16.json', 'tempdata/newsarticles\\2017-02-17.json', 'tempdata/newsarticles\\2017-02-18.json', 'tempdata/newsarticles\\2017-02-19.json', 'tempdata/newsarticles\\2017-02-20.json', 'tempdata/newsarticles\\2017-02-21.json', 'tempdata/newsarticles\\2017-02-22.json', 'tempdata/newsarticles\\2017-02-23.json', 'tempdata/newsarticles\\2017-02-24.json', 'tempdata/newsarticles\\2017-02-25.json', 'tempdata/newsarticles\\2017-02-26.json', 
'tempdata/newsarticles\\2017-02-27.json', 'tempdata/newsarticles\\2017-02-28.json', 'tempdata/newsarticles\\2017-03-01.json', 'tempdata/newsarticles\\2017-03-02.json', 'tempdata/newsarticles\\2017-03-03.json', 'tempdata/newsarticles\\2017-03-04.json', 'tempdata/newsarticles\\2017-03-05.json', 'tempdata/newsarticles\\2017-03-06.json', 'tempdata/newsarticles\\2017-03-07.json', 'tempdata/newsarticles\\2017-03-08.json', 'tempdata/newsarticles\\2017-03-09.json', 'tempdata/newsarticles\\2017-03-10.json', 'tempdata/newsarticles\\2017-03-11.json', 'tempdata/newsarticles\\2017-03-12.json', 'tempdata/newsarticles\\2017-03-13.json', 'tempdata/newsarticles\\2017-03-14.json', 'tempdata/newsarticles\\2017-03-15.json', 'tempdata/newsarticles\\2017-03-16.json', 'tempdata/newsarticles\\2017-03-17.json', 'tempdata/newsarticles\\2017-03-18.json', 'tempdata/newsarticles\\2017-03-19.json', 'tempdata/newsarticles\\2017-03-20.json', 'tempdata/newsarticles\\2017-03-21.json', 'tempdata/newsarticles\\2017-03-22.json', 'tempdata/newsarticles\\2017-03-23.json', 'tempdata/newsarticles\\2017-03-24.json', 'tempdata/newsarticles\\2017-03-25.json', 'tempdata/newsarticles\\2017-03-26.json', 'tempdata/newsarticles\\2017-03-27.json', 'tempdata/newsarticles\\2017-03-28.json', 'tempdata/newsarticles\\2017-03-29.json', 'tempdata/newsarticles\\2017-03-30.json', 'tempdata/newsarticles\\2017-03-31.json', 'tempdata/newsarticles\\2017-04-01.json', 'tempdata/newsarticles\\2017-04-02.json', 'tempdata/newsarticles\\2017-04-03.json', 'tempdata/newsarticles\\2017-04-04.json', 'tempdata/newsarticles\\2017-04-05.json', 'tempdata/newsarticles\\2017-04-06.json', 'tempdata/newsarticles\\2017-04-07.json', 'tempdata/newsarticles\\2017-04-08.json', 'tempdata/newsarticles\\2017-04-09.json', 'tempdata/newsarticles\\2017-04-10.json', 'tempdata/newsarticles\\2017-04-11.json', 'tempdata/newsarticles\\2017-04-12.json', 'tempdata/newsarticles\\2017-04-13.json', 'tempdata/newsarticles\\2017-04-14.json', 
'tempdata/newsarticles\\2017-04-15.json', 'tempdata/newsarticles\\2017-04-16.json', 'tempdata/newsarticles\\2017-04-17.json', 'tempdata/newsarticles\\2017-04-18.json', 'tempdata/newsarticles\\2017-04-19.json', 'tempdata/newsarticles\\2017-04-20.json', 'tempdata/newsarticles\\2017-04-21.json', 'tempdata/newsarticles\\2017-04-22.json', 'tempdata/newsarticles\\2017-04-23.json', 'tempdata/newsarticles\\2017-04-24.json', 'tempdata/newsarticles\\2017-04-25.json', 'tempdata/newsarticles\\2017-04-26.json', 'tempdata/newsarticles\\2017-04-27.json', 'tempdata/newsarticles\\2017-04-28.json', 'tempdata/newsarticles\\2017-04-29.json', 'tempdata/newsarticles\\2017-04-30.json', 'tempdata/newsarticles\\2017-05-01.json', 'tempdata/newsarticles\\2017-05-02.json', 'tempdata/newsarticles\\2017-05-03.json', 'tempdata/newsarticles\\2017-05-04.json', 'tempdata/newsarticles\\2017-05-05.json', 'tempdata/newsarticles\\2017-05-06.json', 'tempdata/newsarticles\\2017-05-07.json', 'tempdata/newsarticles\\2017-05-08.json', 'tempdata/newsarticles\\2017-05-09.json', 'tempdata/newsarticles\\2017-05-10.json', 'tempdata/newsarticles\\2017-05-11.json', 'tempdata/newsarticles\\2017-05-12.json', 'tempdata/newsarticles\\2017-05-13.json', 'tempdata/newsarticles\\2017-05-14.json', 'tempdata/newsarticles\\2017-05-15.json', 'tempdata/newsarticles\\2017-05-16.json', 'tempdata/newsarticles\\2017-05-17.json', 'tempdata/newsarticles\\2017-05-18.json', 'tempdata/newsarticles\\2017-05-19.json', 'tempdata/newsarticles\\2017-05-20.json', 'tempdata/newsarticles\\2017-05-21.json', 'tempdata/newsarticles\\2017-05-22.json', 'tempdata/newsarticles\\2017-05-23.json', 'tempdata/newsarticles\\2017-05-24.json', 'tempdata/newsarticles\\2017-05-25.json', 'tempdata/newsarticles\\2017-05-26.json', 'tempdata/newsarticles\\2017-05-27.json', 'tempdata/newsarticles\\2017-05-28.json', 'tempdata/newsarticles\\2017-05-29.json', 'tempdata/newsarticles\\2017-05-30.json', 'tempdata/newsarticles\\2017-05-31.json', 
'tempdata/newsarticles\\2017-06-01.json', 'tempdata/newsarticles\\2017-06-02.json', 'tempdata/newsarticles\\2017-06-03.json', 'tempdata/newsarticles\\2017-06-04.json', 'tempdata/newsarticles\\2017-06-05.json', 'tempdata/newsarticles\\2017-06-06.json', 'tempdata/newsarticles\\2017-06-07.json', 'tempdata/newsarticles\\2017-06-08.json', 'tempdata/newsarticles\\2017-06-09.json', 'tempdata/newsarticles\\2017-06-10.json', 'tempdata/newsarticles\\2017-06-11.json', 'tempdata/newsarticles\\2017-06-12.json', 'tempdata/newsarticles\\2017-06-13.json', 'tempdata/newsarticles\\2017-06-14.json', 'tempdata/newsarticles\\2017-06-15.json', 'tempdata/newsarticles\\2017-06-16.json', 'tempdata/newsarticles\\2017-06-17.json', 'tempdata/newsarticles\\2017-06-18.json', 'tempdata/newsarticles\\2017-06-19.json', 'tempdata/newsarticles\\2017-06-20.json', 'tempdata/newsarticles\\2017-06-21.json', 'tempdata/newsarticles\\2017-06-22.json', 'tempdata/newsarticles\\2017-06-23.json', 'tempdata/newsarticles\\2017-06-24.json', 'tempdata/newsarticles\\2017-06-25.json', 'tempdata/newsarticles\\2017-06-26.json', 'tempdata/newsarticles\\2017-06-27.json', 'tempdata/newsarticles\\2017-06-28.json', 'tempdata/newsarticles\\2017-06-29.json', 'tempdata/newsarticles\\2017-06-30.json', 'tempdata/newsarticles\\2017-07-01.json', 'tempdata/newsarticles\\2017-07-02.json', 'tempdata/newsarticles\\2017-07-03.json', 'tempdata/newsarticles\\2017-07-04.json', 'tempdata/newsarticles\\2017-07-05.json', 'tempdata/newsarticles\\2017-07-06.json', 'tempdata/newsarticles\\2017-07-07.json', 'tempdata/newsarticles\\2017-07-08.json', 'tempdata/newsarticles\\2017-07-09.json', 'tempdata/newsarticles\\2017-07-10.json', 'tempdata/newsarticles\\2017-07-11.json', 'tempdata/newsarticles\\2017-07-12.json', 'tempdata/newsarticles\\2017-07-13.json', 'tempdata/newsarticles\\2017-07-14.json', 'tempdata/newsarticles\\2017-07-15.json', 'tempdata/newsarticles\\2017-07-16.json', 'tempdata/newsarticles\\2017-07-17.json', 
'tempdata/newsarticles\\2017-07-18.json', 'tempdata/newsarticles\\2017-07-19.json', 'tempdata/newsarticles\\2017-07-20.json', 'tempdata/newsarticles\\2017-07-21.json', 'tempdata/newsarticles\\2017-07-22.json', 'tempdata/newsarticles\\2017-07-23.json', 'tempdata/newsarticles\\2017-07-24.json', 'tempdata/newsarticles\\2017-07-25.json', 'tempdata/newsarticles\\2017-07-26.json', 'tempdata/newsarticles\\2017-07-27.json', 'tempdata/newsarticles\\2017-07-28.json', 'tempdata/newsarticles\\2017-07-29.json', 'tempdata/newsarticles\\2017-07-30.json', 'tempdata/newsarticles\\2017-07-31.json', 'tempdata/newsarticles\\2017-08-01.json', 'tempdata/newsarticles\\2017-08-02.json', 'tempdata/newsarticles\\2017-08-03.json', 'tempdata/newsarticles\\2017-08-04.json', 'tempdata/newsarticles\\2017-08-05.json', 'tempdata/newsarticles\\2017-08-06.json', 'tempdata/newsarticles\\2017-08-07.json', 'tempdata/newsarticles\\2017-08-08.json', 'tempdata/newsarticles\\2017-08-09.json', 'tempdata/newsarticles\\2017-08-10.json', 'tempdata/newsarticles\\2017-08-11.json', 'tempdata/newsarticles\\2017-08-12.json', 'tempdata/newsarticles\\2017-08-13.json', 'tempdata/newsarticles\\2017-08-14.json', 'tempdata/newsarticles\\2017-08-15.json', 'tempdata/newsarticles\\2017-08-16.json', 'tempdata/newsarticles\\2017-08-17.json', 'tempdata/newsarticles\\2017-08-18.json', 'tempdata/newsarticles\\2017-08-19.json', 'tempdata/newsarticles\\2017-08-20.json', 'tempdata/newsarticles\\2017-08-21.json', 'tempdata/newsarticles\\2017-08-22.json', 'tempdata/newsarticles\\2017-08-23.json', 'tempdata/newsarticles\\2017-08-24.json', 'tempdata/newsarticles\\2017-08-25.json', 'tempdata/newsarticles\\2017-08-26.json', 'tempdata/newsarticles\\2017-08-27.json', 'tempdata/newsarticles\\2017-08-28.json', 'tempdata/newsarticles\\2017-08-29.json', 'tempdata/newsarticles\\2017-08-30.json', 'tempdata/newsarticles\\2017-08-31.json', 'tempdata/newsarticles\\2017-09-01.json', 'tempdata/newsarticles\\2017-09-02.json', 
'tempdata/newsarticles\\2017-09-03.json', 'tempdata/newsarticles\\2017-09-04.json', 'tempdata/newsarticles\\2017-09-05.json', 'tempdata/newsarticles\\2017-09-06.json', 'tempdata/newsarticles\\2017-09-07.json', 'tempdata/newsarticles\\2017-09-08.json', 'tempdata/newsarticles\\2017-09-09.json', 'tempdata/newsarticles\\2017-09-10.json', 'tempdata/newsarticles\\2017-09-11.json', 'tempdata/newsarticles\\2017-09-12.json', 'tempdata/newsarticles\\2017-09-13.json', 'tempdata/newsarticles\\2017-09-14.json', 'tempdata/newsarticles\\2017-09-15.json', 'tempdata/newsarticles\\2017-09-16.json', 'tempdata/newsarticles\\2017-09-17.json', 'tempdata/newsarticles\\2017-09-18.json', 'tempdata/newsarticles\\2017-09-19.json', 'tempdata/newsarticles\\2017-09-20.json', 'tempdata/newsarticles\\2017-09-21.json', 'tempdata/newsarticles\\2017-09-22.json', 'tempdata/newsarticles\\2017-09-23.json', 'tempdata/newsarticles\\2017-09-24.json', 'tempdata/newsarticles\\2017-09-25.json', 'tempdata/newsarticles\\2017-09-26.json', 'tempdata/newsarticles\\2017-09-27.json', 'tempdata/newsarticles\\2017-09-28.json', 'tempdata/newsarticles\\2017-09-29.json', 'tempdata/newsarticles\\2017-09-30.json', 'tempdata/newsarticles\\2017-10-01.json', 'tempdata/newsarticles\\2017-10-02.json', 'tempdata/newsarticles\\2017-10-03.json', 'tempdata/newsarticles\\2017-10-04.json', 'tempdata/newsarticles\\2017-10-05.json', 'tempdata/newsarticles\\2017-10-06.json', 'tempdata/newsarticles\\2017-10-07.json', 'tempdata/newsarticles\\2017-10-08.json', 'tempdata/newsarticles\\2017-10-09.json', 'tempdata/newsarticles\\2017-10-10.json', 'tempdata/newsarticles\\2017-10-11.json', 'tempdata/newsarticles\\2017-10-12.json', 'tempdata/newsarticles\\2017-10-13.json', 'tempdata/newsarticles\\2017-10-14.json', 'tempdata/newsarticles\\2017-10-15.json', 'tempdata/newsarticles\\2017-10-16.json', 'tempdata/newsarticles\\2017-10-17.json', 'tempdata/newsarticles\\2017-10-18.json', 'tempdata/newsarticles\\2017-10-19.json', 
'tempdata/newsarticles\\2017-10-20.json', 'tempdata/newsarticles\\2017-10-21.json', 'tempdata/newsarticles\\2017-10-22.json', 'tempdata/newsarticles\\2017-10-23.json', 'tempdata/newsarticles\\2017-10-24.json', 'tempdata/newsarticles\\2017-10-25.json', 'tempdata/newsarticles\\2017-10-26.json', 'tempdata/newsarticles\\2017-10-27.json', 'tempdata/newsarticles\\2017-10-28.json', 'tempdata/newsarticles\\2017-10-29.json', 'tempdata/newsarticles\\2017-10-30.json', 'tempdata/newsarticles\\2017-10-31.json', 'tempdata/newsarticles\\2017-11-01.json', 'tempdata/newsarticles\\2017-11-02.json', 'tempdata/newsarticles\\2017-11-03.json', 'tempdata/newsarticles\\2017-11-04.json', 'tempdata/newsarticles\\2017-11-05.json', 'tempdata/newsarticles\\2017-11-06.json', 'tempdata/newsarticles\\2017-11-07.json', 'tempdata/newsarticles\\2017-11-08.json', 'tempdata/newsarticles\\2017-11-09.json', 'tempdata/newsarticles\\2017-11-10.json', 'tempdata/newsarticles\\2017-11-11.json', 'tempdata/newsarticles\\2017-11-12.json', 'tempdata/newsarticles\\2017-11-13.json', 'tempdata/newsarticles\\2017-11-14.json', 'tempdata/newsarticles\\2017-11-15.json', 'tempdata/newsarticles\\2017-11-16.json', 'tempdata/newsarticles\\2017-11-17.json', 'tempdata/newsarticles\\2017-11-18.json', 'tempdata/newsarticles\\2017-11-19.json', 'tempdata/newsarticles\\2017-11-20.json', 'tempdata/newsarticles\\2017-11-21.json', 'tempdata/newsarticles\\2017-11-22.json', 'tempdata/newsarticles\\2017-11-23.json', 'tempdata/newsarticles\\2017-11-24.json', 'tempdata/newsarticles\\2017-11-25.json', 'tempdata/newsarticles\\2017-11-26.json', 'tempdata/newsarticles\\2017-11-27.json', 'tempdata/newsarticles\\2017-11-28.json', 'tempdata/newsarticles\\2017-11-29.json', 'tempdata/newsarticles\\2017-11-30.json', 'tempdata/newsarticles\\2017-12-01.json', 'tempdata/newsarticles\\2017-12-02.json', 'tempdata/newsarticles\\2017-12-03.json', 'tempdata/newsarticles\\2017-12-04.json', 'tempdata/newsarticles\\2017-12-05.json', 
'tempdata/newsarticles\\2017-12-06.json', 'tempdata/newsarticles\\2017-12-07.json', 'tempdata/newsarticles\\2017-12-08.json', 'tempdata/newsarticles\\2017-12-09.json', 'tempdata/newsarticles\\2017-12-10.json', 'tempdata/newsarticles\\2017-12-11.json', 'tempdata/newsarticles\\2017-12-12.json', 'tempdata/newsarticles\\2017-12-13.json', 'tempdata/newsarticles\\2017-12-14.json', 'tempdata/newsarticles\\2017-12-15.json', 'tempdata/newsarticles\\2017-12-16.json', 'tempdata/newsarticles\\2017-12-17.json', 'tempdata/newsarticles\\2017-12-18.json', 'tempdata/newsarticles\\2017-12-19.json', 'tempdata/newsarticles\\2017-12-20.json', 'tempdata/newsarticles\\2017-12-21.json', 'tempdata/newsarticles\\2017-12-22.json', 'tempdata/newsarticles\\2017-12-23.json', 'tempdata/newsarticles\\2017-12-24.json', 'tempdata/newsarticles\\2017-12-25.json', 'tempdata/newsarticles\\2017-12-26.json', 'tempdata/newsarticles\\2017-12-27.json', 'tempdata/newsarticles\\2017-12-28.json', 'tempdata/newsarticles\\2017-12-29.json', 'tempdata/newsarticles\\2017-12-30.json', 'tempdata/newsarticles\\2017-12-31.json', 'tempdata/newsarticles\\2018-01-01.json', 'tempdata/newsarticles\\2018-01-02.json', 'tempdata/newsarticles\\2018-01-03.json', 'tempdata/newsarticles\\2018-01-04.json', 'tempdata/newsarticles\\2018-01-05.json', 'tempdata/newsarticles\\2018-01-06.json', 'tempdata/newsarticles\\2018-01-07.json', 'tempdata/newsarticles\\2018-01-08.json', 'tempdata/newsarticles\\2018-01-09.json', 'tempdata/newsarticles\\2018-01-10.json', 'tempdata/newsarticles\\2018-01-11.json', 'tempdata/newsarticles\\2018-01-12.json', 'tempdata/newsarticles\\2018-01-13.json', 'tempdata/newsarticles\\2018-01-14.json', 'tempdata/newsarticles\\2018-01-15.json', 'tempdata/newsarticles\\2018-01-16.json', 'tempdata/newsarticles\\2018-01-17.json', 'tempdata/newsarticles\\2018-01-18.json', 'tempdata/newsarticles\\2018-01-19.json', 'tempdata/newsarticles\\2018-01-20.json', 'tempdata/newsarticles\\2018-01-21.json', 
'tempdata/newsarticles\\2018-01-22.json', 'tempdata/newsarticles\\2018-01-23.json', 'tempdata/newsarticles\\2018-01-24.json', 'tempdata/newsarticles\\2018-01-25.json', 'tempdata/newsarticles\\2018-01-26.json', 'tempdata/newsarticles\\2018-01-27.json', 'tempdata/newsarticles\\2018-01-28.json', 'tempdata/newsarticles\\2018-01-29.json', 'tempdata/newsarticles\\2018-01-30.json', 'tempdata/newsarticles\\2018-01-31.json', 'tempdata/newsarticles\\2018-02-01.json', 'tempdata/newsarticles\\2018-02-02.json', 'tempdata/newsarticles\\2018-02-03.json', 'tempdata/newsarticles\\2018-02-04.json', 'tempdata/newsarticles\\2018-02-05.json', 'tempdata/newsarticles\\2018-02-06.json', 'tempdata/newsarticles\\2018-02-07.json', 'tempdata/newsarticles\\2018-02-08.json', 'tempdata/newsarticles\\2018-02-09.json', 'tempdata/newsarticles\\2018-02-10.json', 'tempdata/newsarticles\\2018-02-11.json', 'tempdata/newsarticles\\2018-02-12.json', 'tempdata/newsarticles\\2018-02-13.json', 'tempdata/newsarticles\\2018-02-14.json', 'tempdata/newsarticles\\2018-02-15.json', 'tempdata/newsarticles\\2018-02-16.json', 'tempdata/newsarticles\\2018-02-17.json', 'tempdata/newsarticles\\2018-02-18.json', 'tempdata/newsarticles\\2018-02-19.json', 'tempdata/newsarticles\\2018-02-20.json', 'tempdata/newsarticles\\2018-02-21.json', 'tempdata/newsarticles\\2018-02-22.json', 'tempdata/newsarticles\\2018-02-23.json', 'tempdata/newsarticles\\2018-02-24.json', 'tempdata/newsarticles\\2018-02-25.json', 'tempdata/newsarticles\\2018-02-26.json', 'tempdata/newsarticles\\2018-02-27.json', 'tempdata/newsarticles\\2018-02-28.json', 'tempdata/newsarticles\\2018-03-01.json', 'tempdata/newsarticles\\2018-03-02.json', 'tempdata/newsarticles\\2018-03-03.json', 'tempdata/newsarticles\\2018-03-04.json', 'tempdata/newsarticles\\2018-03-05.json', 'tempdata/newsarticles\\2018-03-06.json', 'tempdata/newsarticles\\2018-03-07.json', 'tempdata/newsarticles\\2018-03-08.json', 'tempdata/newsarticles\\2018-03-09.json', 
'tempdata/newsarticles\\2018-03-10.json', 'tempdata/newsarticles\\2018-03-11.json', 'tempdata/newsarticles\\2018-03-12.json', 'tempdata/newsarticles\\2018-03-13.json', 'tempdata/newsarticles\\2018-03-14.json', 'tempdata/newsarticles\\2018-03-15.json', 'tempdata/newsarticles\\2018-03-16.json', 'tempdata/newsarticles\\2018-03-17.json', 'tempdata/newsarticles\\2018-03-18.json', 'tempdata/newsarticles\\2018-03-19.json', 'tempdata/newsarticles\\2018-03-20.json', 'tempdata/newsarticles\\2018-03-21.json', 'tempdata/newsarticles\\2018-03-22.json', 'tempdata/newsarticles\\2018-03-23.json', 'tempdata/newsarticles\\2018-03-24.json', 'tempdata/newsarticles\\2018-03-25.json', 'tempdata/newsarticles\\2018-03-26.json', 'tempdata/newsarticles\\2018-03-27.json', 'tempdata/newsarticles\\2018-03-28.json', 'tempdata/newsarticles\\2018-03-29.json', 'tempdata/newsarticles\\2018-03-30.json', 'tempdata/newsarticles\\2018-03-31.json', 'tempdata/newsarticles\\2018-04-01.json', 'tempdata/newsarticles\\2018-04-02.json', 'tempdata/newsarticles\\2018-04-03.json', 'tempdata/newsarticles\\2018-04-04.json', 'tempdata/newsarticles\\2018-04-05.json', 'tempdata/newsarticles\\2018-04-06.json', 'tempdata/newsarticles\\2018-04-07.json', 'tempdata/newsarticles\\2018-04-08.json', 'tempdata/newsarticles\\2018-04-09.json', 'tempdata/newsarticles\\2018-04-10.json', 'tempdata/newsarticles\\2018-04-11.json', 'tempdata/newsarticles\\2018-04-12.json', 'tempdata/newsarticles\\2018-04-13.json', 'tempdata/newsarticles\\2018-04-14.json', 'tempdata/newsarticles\\2018-04-15.json', 'tempdata/newsarticles\\2018-04-16.json', 'tempdata/newsarticles\\2018-04-17.json', 'tempdata/newsarticles\\2018-04-18.json', 'tempdata/newsarticles\\2018-04-19.json', 'tempdata/newsarticles\\2018-04-20.json', 'tempdata/newsarticles\\2018-04-21.json', 'tempdata/newsarticles\\2018-04-22.json', 'tempdata/newsarticles\\2018-04-23.json', 'tempdata/newsarticles\\2018-04-24.json', 'tempdata/newsarticles\\2018-04-25.json', 
'tempdata/newsarticles\\2018-04-26.json', 'tempdata/newsarticles\\2018-04-27.json', 'tempdata/newsarticles\\2018-04-28.json', 'tempdata/newsarticles\\2018-04-29.json', 'tempdata/newsarticles\\2018-04-30.json', 'tempdata/newsarticles\\2018-05-01.json', 'tempdata/newsarticles\\2018-05-02.json', 'tempdata/newsarticles\\2018-05-03.json', 'tempdata/newsarticles\\2018-05-04.json', 'tempdata/newsarticles\\2018-05-05.json', 'tempdata/newsarticles\\2018-05-06.json', 'tempdata/newsarticles\\2018-05-07.json', 'tempdata/newsarticles\\2018-05-08.json', 'tempdata/newsarticles\\2018-05-09.json', 'tempdata/newsarticles\\2018-05-10.json', 'tempdata/newsarticles\\2018-05-11.json', 'tempdata/newsarticles\\2018-05-12.json', 'tempdata/newsarticles\\2018-05-13.json', 'tempdata/newsarticles\\2018-05-14.json', 'tempdata/newsarticles\\2018-05-15.json', 'tempdata/newsarticles\\2018-05-16.json', 'tempdata/newsarticles\\2018-05-17.json', 'tempdata/newsarticles\\2018-05-18.json', 'tempdata/newsarticles\\2018-05-19.json', 'tempdata/newsarticles\\2018-05-20.json', 'tempdata/newsarticles\\2018-05-21.json', 'tempdata/newsarticles\\2018-05-22.json', 'tempdata/newsarticles\\2018-05-23.json', 'tempdata/newsarticles\\2018-05-24.json', 'tempdata/newsarticles\\2018-05-25.json', 'tempdata/newsarticles\\2018-05-26.json', 'tempdata/newsarticles\\2018-05-27.json', 'tempdata/newsarticles\\2018-05-28.json', 'tempdata/newsarticles\\2018-05-29.json', 'tempdata/newsarticles\\2018-05-30.json', 'tempdata/newsarticles\\2018-05-31.json', 'tempdata/newsarticles\\2018-06-01.json', 'tempdata/newsarticles\\2018-06-02.json', 'tempdata/newsarticles\\2018-06-03.json', 'tempdata/newsarticles\\2018-06-04.json', 'tempdata/newsarticles\\2018-06-05.json', 'tempdata/newsarticles\\2018-06-06.json', 'tempdata/newsarticles\\2018-06-07.json', 'tempdata/newsarticles\\2018-06-08.json', 'tempdata/newsarticles\\2018-06-09.json', 'tempdata/newsarticles\\2018-06-10.json', 'tempdata/newsarticles\\2018-06-11.json', 
'tempdata/newsarticles\\2018-06-12.json', 'tempdata/newsarticles\\2018-06-13.json', 'tempdata/newsarticles\\2018-06-14.json', 'tempdata/newsarticles\\2018-06-15.json', 'tempdata/newsarticles\\2018-06-16.json', 'tempdata/newsarticles\\2018-06-17.json', 'tempdata/newsarticles\\2018-06-18.json', 'tempdata/newsarticles\\2018-06-19.json', 'tempdata/newsarticles\\2018-06-20.json', 'tempdata/newsarticles\\2018-06-21.json', 'tempdata/newsarticles\\2018-06-22.json', 'tempdata/newsarticles\\2018-06-23.json', 'tempdata/newsarticles\\2018-06-24.json', 'tempdata/newsarticles\\2018-06-25.json', 'tempdata/newsarticles\\2018-06-26.json', 'tempdata/newsarticles\\2018-06-27.json', 'tempdata/newsarticles\\2018-06-28.json', 'tempdata/newsarticles\\2018-06-29.json', 'tempdata/newsarticles\\2018-06-30.json', 'tempdata/newsarticles\\2018-07-01.json', 'tempdata/newsarticles\\2018-07-02.json', 'tempdata/newsarticles\\2018-07-03.json', 'tempdata/newsarticles\\2018-07-04.json', 'tempdata/newsarticles\\2018-07-05.json', 'tempdata/newsarticles\\2018-07-06.json', 'tempdata/newsarticles\\2018-07-07.json', 'tempdata/newsarticles\\2018-07-08.json', 'tempdata/newsarticles\\2018-07-09.json', 'tempdata/newsarticles\\2018-07-10.json', 'tempdata/newsarticles\\2018-07-11.json', 'tempdata/newsarticles\\2018-07-12.json', 'tempdata/newsarticles\\2018-07-13.json', 'tempdata/newsarticles\\2018-07-14.json', 'tempdata/newsarticles\\2018-07-15.json', 'tempdata/newsarticles\\2018-07-16.json', 'tempdata/newsarticles\\2018-07-17.json', 'tempdata/newsarticles\\2018-07-18.json', 'tempdata/newsarticles\\2018-07-19.json', 'tempdata/newsarticles\\2018-07-20.json', 'tempdata/newsarticles\\2018-07-21.json', 'tempdata/newsarticles\\2018-07-22.json', 'tempdata/newsarticles\\2018-07-23.json', 'tempdata/newsarticles\\2018-07-24.json', 'tempdata/newsarticles\\2018-07-25.json', 'tempdata/newsarticles\\2018-07-26.json', 'tempdata/newsarticles\\2018-07-27.json', 'tempdata/newsarticles\\2018-07-28.json', 
'tempdata/newsarticles\\2018-07-29.json', 'tempdata/newsarticles\\2018-07-30.json', 'tempdata/newsarticles\\2018-07-31.json', 'tempdata/newsarticles\\2018-08-01.json', 'tempdata/newsarticles\\2018-08-02.json', 'tempdata/newsarticles\\2018-08-03.json', 'tempdata/newsarticles\\2018-08-04.json', 'tempdata/newsarticles\\2018-08-05.json', 'tempdata/newsarticles\\2018-08-06.json', 'tempdata/newsarticles\\2018-08-07.json', 'tempdata/newsarticles\\2018-08-08.json', 'tempdata/newsarticles\\2018-08-09.json', 'tempdata/newsarticles\\2018-08-10.json', 'tempdata/newsarticles\\2018-08-11.json', 'tempdata/newsarticles\\2018-08-12.json', 'tempdata/newsarticles\\2018-08-13.json', 'tempdata/newsarticles\\2018-08-14.json', 'tempdata/newsarticles\\2018-08-15.json', 'tempdata/newsarticles\\2018-08-16.json', 'tempdata/newsarticles\\2018-08-17.json', 'tempdata/newsarticles\\2018-08-18.json', 'tempdata/newsarticles\\2018-08-19.json', 'tempdata/newsarticles\\2018-08-20.json', 'tempdata/newsarticles\\2018-08-21.json', 'tempdata/newsarticles\\2018-08-22.json', 'tempdata/newsarticles\\2018-08-23.json', 'tempdata/newsarticles\\2018-08-24.json', 'tempdata/newsarticles\\2018-08-25.json', 'tempdata/newsarticles\\2018-08-26.json', 'tempdata/newsarticles\\2018-08-27.json', 'tempdata/newsarticles\\2018-08-28.json', 'tempdata/newsarticles\\2018-08-29.json', 'tempdata/newsarticles\\2018-08-30.json', 'tempdata/newsarticles\\2018-08-31.json', 'tempdata/newsarticles\\2018-09-01.json', 'tempdata/newsarticles\\2018-09-02.json', 'tempdata/newsarticles\\2018-09-03.json', 'tempdata/newsarticles\\2018-09-04.json', 'tempdata/newsarticles\\2018-09-05.json', 'tempdata/newsarticles\\2018-09-06.json', 'tempdata/newsarticles\\2018-09-07.json', 'tempdata/newsarticles\\2018-09-08.json', 'tempdata/newsarticles\\2018-09-09.json', 'tempdata/newsarticles\\2018-09-10.json', 'tempdata/newsarticles\\2018-09-11.json', 'tempdata/newsarticles\\2018-09-12.json', 'tempdata/newsarticles\\2018-09-13.json', 
'tempdata/newsarticles\\2018-09-14.json', 'tempdata/newsarticles\\2018-09-15.json', 'tempdata/newsarticles\\2018-09-16.json', 'tempdata/newsarticles\\2018-09-17.json', 'tempdata/newsarticles\\2018-09-18.json', 'tempdata/newsarticles\\2018-09-19.json', 'tempdata/newsarticles\\2018-09-20.json', 'tempdata/newsarticles\\2018-09-21.json', 'tempdata/newsarticles\\2018-09-22.json', 'tempdata/newsarticles\\2018-09-23.json', 'tempdata/newsarticles\\2018-09-24.json', 'tempdata/newsarticles\\2018-09-25.json', 'tempdata/newsarticles\\2018-09-26.json', 'tempdata/newsarticles\\2018-09-27.json', 'tempdata/newsarticles\\2018-09-28.json', 'tempdata/newsarticles\\2018-09-29.json', 'tempdata/newsarticles\\2018-09-30.json', 'tempdata/newsarticles\\2018-10-01.json', 'tempdata/newsarticles\\2018-10-02.json', 'tempdata/newsarticles\\2018-10-03.json', 'tempdata/newsarticles\\2018-10-04.json', 'tempdata/newsarticles\\2018-10-05.json', 'tempdata/newsarticles\\2018-10-06.json', 'tempdata/newsarticles\\2018-10-07.json', 'tempdata/newsarticles\\2018-10-08.json', 'tempdata/newsarticles\\2018-10-09.json', 'tempdata/newsarticles\\2018-10-10.json', 'tempdata/newsarticles\\2018-10-11.json', 'tempdata/newsarticles\\2018-10-12.json', 'tempdata/newsarticles\\2018-10-13.json', 'tempdata/newsarticles\\2018-10-14.json', 'tempdata/newsarticles\\2018-10-15.json', 'tempdata/newsarticles\\2018-10-16.json', 'tempdata/newsarticles\\2018-10-17.json', 'tempdata/newsarticles\\2018-10-18.json', 'tempdata/newsarticles\\2018-10-19.json', 'tempdata/newsarticles\\2018-10-20.json', 'tempdata/newsarticles\\2018-10-21.json', 'tempdata/newsarticles\\2018-10-22.json', 'tempdata/newsarticles\\2018-10-23.json', 'tempdata/newsarticles\\2018-10-24.json', 'tempdata/newsarticles\\2018-10-25.json', 'tempdata/newsarticles\\2018-10-26.json', 'tempdata/newsarticles\\2018-10-27.json', 'tempdata/newsarticles\\2018-10-28.json', 'tempdata/newsarticles\\2018-10-29.json', 'tempdata/newsarticles\\2018-10-30.json', 
'tempdata/newsarticles\\2018-10-31.json', 'tempdata/newsarticles\\2018-11-01.json', 'tempdata/newsarticles\\2018-11-02.json', 'tempdata/newsarticles\\2018-11-03.json', 'tempdata/newsarticles\\2018-11-04.json', 'tempdata/newsarticles\\2018-11-05.json', 'tempdata/newsarticles\\2018-11-06.json', 'tempdata/newsarticles\\2018-11-07.json', 'tempdata/newsarticles\\2018-11-08.json', 'tempdata/newsarticles\\2018-11-09.json', 'tempdata/newsarticles\\2018-11-10.json', 'tempdata/newsarticles\\2018-11-11.json', 'tempdata/newsarticles\\2018-11-12.json', 'tempdata/newsarticles\\2018-11-13.json', 'tempdata/newsarticles\\2018-11-14.json', 'tempdata/newsarticles\\2018-11-15.json', 'tempdata/newsarticles\\2018-11-16.json', 'tempdata/newsarticles\\2018-11-17.json', 'tempdata/newsarticles\\2018-11-18.json', 'tempdata/newsarticles\\2018-11-19.json', 'tempdata/newsarticles\\2018-11-20.json', 'tempdata/newsarticles\\2018-11-21.json', 'tempdata/newsarticles\\2018-11-22.json', 'tempdata/newsarticles\\2018-11-23.json', 'tempdata/newsarticles\\2018-11-24.json', 'tempdata/newsarticles\\2018-11-25.json', 'tempdata/newsarticles\\2018-11-26.json', 'tempdata/newsarticles\\2018-11-27.json', 'tempdata/newsarticles\\2018-11-28.json', 'tempdata/newsarticles\\2018-11-29.json', 'tempdata/newsarticles\\2018-11-30.json', 'tempdata/newsarticles\\2018-12-01.json', 'tempdata/newsarticles\\2018-12-02.json', 'tempdata/newsarticles\\2018-12-03.json', 'tempdata/newsarticles\\2018-12-04.json', 'tempdata/newsarticles\\2018-12-05.json', 'tempdata/newsarticles\\2018-12-06.json', 'tempdata/newsarticles\\2018-12-07.json', 'tempdata/newsarticles\\2018-12-08.json', 'tempdata/newsarticles\\2018-12-09.json', 'tempdata/newsarticles\\2018-12-10.json', 'tempdata/newsarticles\\2018-12-11.json', 'tempdata/newsarticles\\2018-12-12.json', 'tempdata/newsarticles\\2018-12-13.json', 'tempdata/newsarticles\\2018-12-14.json', 'tempdata/newsarticles\\2018-12-15.json', 'tempdata/newsarticles\\2018-12-16.json', 
'tempdata/newsarticles\\2018-12-17.json', 'tempdata/newsarticles\\2018-12-18.json', 'tempdata/newsarticles\\2018-12-19.json', 'tempdata/newsarticles\\2018-12-20.json', 'tempdata/newsarticles\\2018-12-21.json', 'tempdata/newsarticles\\2018-12-22.json', 'tempdata/newsarticles\\2018-12-23.json', 'tempdata/newsarticles\\2018-12-24.json', 'tempdata/newsarticles\\2018-12-25.json', 'tempdata/newsarticles\\2018-12-26.json', 'tempdata/newsarticles\\2018-12-27.json', 'tempdata/newsarticles\\2018-12-28.json', 'tempdata/newsarticles\\2018-12-29.json', 'tempdata/newsarticles\\2018-12-30.json', 'tempdata/newsarticles\\2018-12-31.json', 'tempdata/newsarticles\\2019-01-01.json', 'tempdata/newsarticles\\2019-01-02.json', 'tempdata/newsarticles\\2019-01-03.json', 'tempdata/newsarticles\\2019-01-04.json', 'tempdata/newsarticles\\2019-01-05.json', 'tempdata/newsarticles\\2019-01-06.json', 'tempdata/newsarticles\\2019-01-07.json', 'tempdata/newsarticles\\2019-01-08.json', 'tempdata/newsarticles\\2019-01-09.json', 'tempdata/newsarticles\\2019-01-10.json', 'tempdata/newsarticles\\2019-01-11.json', 'tempdata/newsarticles\\2019-01-12.json', 'tempdata/newsarticles\\2019-01-13.json', 'tempdata/newsarticles\\2019-01-14.json', 'tempdata/newsarticles\\2019-01-15.json', 'tempdata/newsarticles\\2019-01-16.json', 'tempdata/newsarticles\\2019-01-17.json', 'tempdata/newsarticles\\2019-01-18.json', 'tempdata/newsarticles\\2019-01-19.json', 'tempdata/newsarticles\\2019-01-20.json', 'tempdata/newsarticles\\2019-01-21.json', 'tempdata/newsarticles\\2019-01-22.json', 'tempdata/newsarticles\\2019-01-23.json', 'tempdata/newsarticles\\2019-01-24.json', 'tempdata/newsarticles\\2019-01-25.json', 'tempdata/newsarticles\\2019-01-26.json', 'tempdata/newsarticles\\2019-01-27.json', 'tempdata/newsarticles\\2019-01-28.json', 'tempdata/newsarticles\\2019-01-29.json', 'tempdata/newsarticles\\2019-01-30.json', 'tempdata/newsarticles\\2019-01-31.json', 'tempdata/newsarticles\\2019-02-01.json', 
'tempdata/newsarticles\\2019-02-02.json', 'tempdata/newsarticles\\2019-02-03.json', 'tempdata/newsarticles\\2019-02-04.json', 'tempdata/newsarticles\\2019-02-05.json', 'tempdata/newsarticles\\2019-02-06.json', 'tempdata/newsarticles\\2019-02-07.json', 'tempdata/newsarticles\\2019-02-08.json', 'tempdata/newsarticles\\2019-02-09.json', 'tempdata/newsarticles\\2019-02-10.json', 'tempdata/newsarticles\\2019-02-11.json', 'tempdata/newsarticles\\2019-02-12.json', 'tempdata/newsarticles\\2019-02-13.json', 'tempdata/newsarticles\\2019-02-14.json', 'tempdata/newsarticles\\2019-02-15.json', 'tempdata/newsarticles\\2019-02-16.json', 'tempdata/newsarticles\\2019-02-17.json', 'tempdata/newsarticles\\2019-02-18.json', 'tempdata/newsarticles\\2019-02-19.json', 'tempdata/newsarticles\\2019-02-20.json', 'tempdata/newsarticles\\2019-02-21.json', 'tempdata/newsarticles\\2019-02-22.json', 'tempdata/newsarticles\\2019-02-23.json', 'tempdata/newsarticles\\2019-02-24.json', 'tempdata/newsarticles\\2019-02-25.json', 'tempdata/newsarticles\\2019-02-26.json', 'tempdata/newsarticles\\2019-02-27.json', 'tempdata/newsarticles\\2019-02-28.json', 'tempdata/newsarticles\\2019-03-01.json', 'tempdata/newsarticles\\2019-03-02.json', 'tempdata/newsarticles\\2019-03-03.json', 'tempdata/newsarticles\\2019-03-04.json', 'tempdata/newsarticles\\2019-03-05.json', 'tempdata/newsarticles\\2019-03-06.json', 'tempdata/newsarticles\\2019-03-07.json', 'tempdata/newsarticles\\2019-03-08.json', 'tempdata/newsarticles\\2019-03-09.json', 'tempdata/newsarticles\\2019-03-10.json', 'tempdata/newsarticles\\2019-03-11.json', 'tempdata/newsarticles\\2019-03-12.json', 'tempdata/newsarticles\\2019-03-13.json', 'tempdata/newsarticles\\2019-03-14.json', 'tempdata/newsarticles\\2019-03-15.json', 'tempdata/newsarticles\\2019-03-16.json', 'tempdata/newsarticles\\2019-03-17.json', 'tempdata/newsarticles\\2019-03-18.json', 'tempdata/newsarticles\\2019-03-19.json', 'tempdata/newsarticles\\2019-03-20.json', 
'tempdata/newsarticles\\2019-03-21.json', 'tempdata/newsarticles\\2019-03-22.json', 'tempdata/newsarticles\\2019-03-23.json', 'tempdata/newsarticles\\2019-03-24.json', 'tempdata/newsarticles\\2019-03-25.json', 'tempdata/newsarticles\\2019-03-26.json', 'tempdata/newsarticles\\2019-03-27.json', 'tempdata/newsarticles\\2019-03-28.json', 'tempdata/newsarticles\\2019-03-29.json', 'tempdata/newsarticles\\2019-03-30.json', 'tempdata/newsarticles\\2019-03-31.json', 'tempdata/newsarticles\\2019-04-01.json', 'tempdata/newsarticles\\2019-04-02.json', 'tempdata/newsarticles\\2019-04-03.json', 'tempdata/newsarticles\\2019-04-04.json', 'tempdata/newsarticles\\2019-04-05.json', 'tempdata/newsarticles\\2019-04-06.json', 'tempdata/newsarticles\\2019-04-07.json', 'tempdata/newsarticles\\2019-04-08.json', 'tempdata/newsarticles\\2019-04-09.json', 'tempdata/newsarticles\\2019-04-10.json', 'tempdata/newsarticles\\2019-04-11.json', 'tempdata/newsarticles\\2019-04-12.json', 'tempdata/newsarticles\\2019-04-13.json', 'tempdata/newsarticles\\2019-04-14.json', 'tempdata/newsarticles\\2019-04-15.json', 'tempdata/newsarticles\\2019-04-16.json', 'tempdata/newsarticles\\2019-04-17.json', 'tempdata/newsarticles\\2019-04-18.json', 'tempdata/newsarticles\\2019-04-19.json', 'tempdata/newsarticles\\2019-04-20.json', 'tempdata/newsarticles\\2019-04-21.json', 'tempdata/newsarticles\\2019-04-22.json', 'tempdata/newsarticles\\2019-04-23.json', 'tempdata/newsarticles\\2019-04-24.json', 'tempdata/newsarticles\\2019-04-25.json', 'tempdata/newsarticles\\2019-04-26.json', 'tempdata/newsarticles\\2019-04-27.json', 'tempdata/newsarticles\\2019-04-28.json', 'tempdata/newsarticles\\2019-04-29.json', 'tempdata/newsarticles\\2019-04-30.json', 'tempdata/newsarticles\\2019-05-01.json', 'tempdata/newsarticles\\2019-05-02.json', 'tempdata/newsarticles\\2019-05-03.json', 'tempdata/newsarticles\\2019-05-04.json', 'tempdata/newsarticles\\2019-05-05.json', 'tempdata/newsarticles\\2019-05-06.json', 
'tempdata/newsarticles\\2019-05-07.json', 'tempdata/newsarticles\\2019-05-08.json', 'tempdata/newsarticles\\2019-05-09.json', 'tempdata/newsarticles\\2019-05-10.json', 'tempdata/newsarticles\\2019-05-11.json', 'tempdata/newsarticles\\2019-05-12.json', 'tempdata/newsarticles\\2019-05-13.json', 'tempdata/newsarticles\\2019-05-14.json', 'tempdata/newsarticles\\2019-05-15.json', 'tempdata/newsarticles\\2019-05-16.json', 'tempdata/newsarticles\\2019-05-17.json', 'tempdata/newsarticles\\2019-05-18.json', 'tempdata/newsarticles\\2019-05-19.json', 'tempdata/newsarticles\\2019-05-20.json', 'tempdata/newsarticles\\2019-05-21.json', 'tempdata/newsarticles\\2019-05-22.json', 'tempdata/newsarticles\\2019-05-23.json', 'tempdata/newsarticles\\2019-05-24.json', 'tempdata/newsarticles\\2019-05-25.json', 'tempdata/newsarticles\\2019-05-26.json', 'tempdata/newsarticles\\2019-05-27.json', 'tempdata/newsarticles\\2019-05-28.json', 'tempdata/newsarticles\\2019-05-29.json', 'tempdata/newsarticles\\2019-05-30.json', 'tempdata/newsarticles\\2019-05-31.json', 'tempdata/newsarticles\\2019-06-01.json', 'tempdata/newsarticles\\2019-06-02.json', 'tempdata/newsarticles\\2019-06-03.json', 'tempdata/newsarticles\\2019-06-04.json', 'tempdata/newsarticles\\2019-06-05.json', 'tempdata/newsarticles\\2019-06-06.json', 'tempdata/newsarticles\\2019-06-07.json', 'tempdata/newsarticles\\2019-06-08.json', 'tempdata/newsarticles\\2019-06-09.json', 'tempdata/newsarticles\\2019-06-10.json', 'tempdata/newsarticles\\2019-06-11.json', 'tempdata/newsarticles\\2019-06-12.json', 'tempdata/newsarticles\\2019-06-13.json', 'tempdata/newsarticles\\2019-06-14.json', 'tempdata/newsarticles\\2019-06-15.json', 'tempdata/newsarticles\\2019-06-16.json', 'tempdata/newsarticles\\2019-06-17.json', 'tempdata/newsarticles\\2019-06-18.json', 'tempdata/newsarticles\\2019-06-19.json', 'tempdata/newsarticles\\2019-06-20.json', 'tempdata/newsarticles\\2019-06-21.json', 'tempdata/newsarticles\\2019-06-22.json', 
'tempdata/newsarticles\\2019-06-23.json', 'tempdata/newsarticles\\2019-06-24.json', 'tempdata/newsarticles\\2019-06-25.json', 'tempdata/newsarticles\\2019-06-26.json', 'tempdata/newsarticles\\2019-06-27.json', 'tempdata/newsarticles\\2019-06-28.json', 'tempdata/newsarticles\\2019-06-29.json', 'tempdata/newsarticles\\2019-06-30.json', 'tempdata/newsarticles\\2019-07-01.json', 'tempdata/newsarticles\\2019-07-02.json', 'tempdata/newsarticles\\2019-07-03.json', 'tempdata/newsarticles\\2019-07-04.json', 'tempdata/newsarticles\\2019-07-05.json', 'tempdata/newsarticles\\2019-07-06.json', 'tempdata/newsarticles\\2019-07-07.json', 'tempdata/newsarticles\\2019-07-08.json', 'tempdata/newsarticles\\2019-07-09.json', 'tempdata/newsarticles\\2019-07-10.json', 'tempdata/newsarticles\\2019-07-11.json', 'tempdata/newsarticles\\2019-07-12.json', 'tempdata/newsarticles\\2019-07-13.json', 'tempdata/newsarticles\\2019-07-14.json', 'tempdata/newsarticles\\2019-07-15.json', 'tempdata/newsarticles\\2019-07-16.json', 'tempdata/newsarticles\\2019-07-17.json', 'tempdata/newsarticles\\2019-07-18.json', 'tempdata/newsarticles\\2019-07-19.json', 'tempdata/newsarticles\\2019-07-20.json', 'tempdata/newsarticles\\2019-07-21.json', 'tempdata/newsarticles\\2019-07-22.json', 'tempdata/newsarticles\\2019-07-23.json', 'tempdata/newsarticles\\2019-07-24.json', 'tempdata/newsarticles\\2019-07-25.json', 'tempdata/newsarticles\\2019-07-26.json', 'tempdata/newsarticles\\2019-07-27.json', 'tempdata/newsarticles\\2019-07-28.json', 'tempdata/newsarticles\\2019-07-29.json', 'tempdata/newsarticles\\2019-07-30.json', 'tempdata/newsarticles\\2019-07-31.json', 'tempdata/newsarticles\\2019-08-01.json', 'tempdata/newsarticles\\2019-08-02.json', 'tempdata/newsarticles\\2019-08-03.json', 'tempdata/newsarticles\\2019-08-04.json', 'tempdata/newsarticles\\2019-08-05.json', 'tempdata/newsarticles\\2019-08-06.json', 'tempdata/newsarticles\\2019-08-07.json', 'tempdata/newsarticles\\2019-08-08.json', 
'tempdata/newsarticles\\2019-08-09.json', 'tempdata/newsarticles\\2019-08-10.json', 'tempdata/newsarticles\\2019-08-11.json', 'tempdata/newsarticles\\2019-08-12.json', 'tempdata/newsarticles\\2019-08-13.json', 'tempdata/newsarticles\\2019-08-14.json', 'tempdata/newsarticles\\2019-08-15.json', 'tempdata/newsarticles\\2019-08-16.json', 'tempdata/newsarticles\\2019-08-17.json', 'tempdata/newsarticles\\2019-08-18.json', 'tempdata/newsarticles\\2019-08-19.json', 'tempdata/newsarticles\\2019-08-20.json', 'tempdata/newsarticles\\2019-08-21.json', 'tempdata/newsarticles\\2019-08-22.json', 'tempdata/newsarticles\\2019-08-23.json', 'tempdata/newsarticles\\2019-08-24.json', 'tempdata/newsarticles\\2019-08-25.json', 'tempdata/newsarticles\\2019-08-26.json', 'tempdata/newsarticles\\2019-08-27.json', 'tempdata/newsarticles\\2019-08-28.json', 'tempdata/newsarticles\\2019-08-29.json', 'tempdata/newsarticles\\2019-08-30.json', 'tempdata/newsarticles\\2019-08-31.json', 'tempdata/newsarticles\\2019-09-01.json', 'tempdata/newsarticles\\2019-09-02.json', 'tempdata/newsarticles\\2019-09-03.json', 'tempdata/newsarticles\\2019-09-04.json', 'tempdata/newsarticles\\2019-09-05.json', 'tempdata/newsarticles\\2019-09-06.json', 'tempdata/newsarticles\\2019-09-07.json', 'tempdata/newsarticles\\2019-09-08.json', 'tempdata/newsarticles\\2019-09-09.json', 'tempdata/newsarticles\\2019-09-10.json', 'tempdata/newsarticles\\2019-09-11.json', 'tempdata/newsarticles\\2019-09-12.json', 'tempdata/newsarticles\\2019-09-13.json', 'tempdata/newsarticles\\2019-09-14.json', 'tempdata/newsarticles\\2019-09-15.json', 'tempdata/newsarticles\\2019-09-16.json', 'tempdata/newsarticles\\2019-09-17.json', 'tempdata/newsarticles\\2019-09-18.json', 'tempdata/newsarticles\\2019-09-19.json', 'tempdata/newsarticles\\2019-09-20.json', 'tempdata/newsarticles\\2019-09-21.json', 'tempdata/newsarticles\\2019-09-22.json', 'tempdata/newsarticles\\2019-09-23.json', 'tempdata/newsarticles\\2019-09-24.json', 
'tempdata/newsarticles\\2019-09-25.json', 'tempdata/newsarticles\\2019-09-26.json', 'tempdata/newsarticles\\2019-09-27.json', 'tempdata/newsarticles\\2019-09-28.json', 'tempdata/newsarticles\\2019-09-29.json', 'tempdata/newsarticles\\2019-09-30.json', 'tempdata/newsarticles\\2019-10-01.json', 'tempdata/newsarticles\\2019-10-02.json', 'tempdata/newsarticles\\2019-10-03.json', 'tempdata/newsarticles\\2019-10-04.json', 'tempdata/newsarticles\\2019-10-05.json', 'tempdata/newsarticles\\2019-10-06.json', 'tempdata/newsarticles\\2019-10-07.json', 'tempdata/newsarticles\\2019-10-08.json', 'tempdata/newsarticles\\2019-10-09.json', 'tempdata/newsarticles\\2019-10-10.json', 'tempdata/newsarticles\\2019-10-11.json', 'tempdata/newsarticles\\2019-10-12.json', 'tempdata/newsarticles\\2019-10-13.json', 'tempdata/newsarticles\\2019-10-14.json', 'tempdata/newsarticles\\2019-10-15.json', 'tempdata/newsarticles\\2019-10-16.json', 'tempdata/newsarticles\\2019-10-17.json', 'tempdata/newsarticles\\2019-10-18.json', 'tempdata/newsarticles\\2019-10-19.json', 'tempdata/newsarticles\\2019-10-20.json', 'tempdata/newsarticles\\2019-10-21.json', 'tempdata/newsarticles\\2019-10-22.json', 'tempdata/newsarticles\\2019-10-23.json', 'tempdata/newsarticles\\2019-10-24.json', 'tempdata/newsarticles\\2019-10-25.json', 'tempdata/newsarticles\\2019-10-26.json', 'tempdata/newsarticles\\2019-10-27.json', 'tempdata/newsarticles\\2019-10-28.json', 'tempdata/newsarticles\\2019-10-29.json', 'tempdata/newsarticles\\2019-10-30.json', 'tempdata/newsarticles\\2019-10-31.json', 'tempdata/newsarticles\\2019-11-01.json', 'tempdata/newsarticles\\2019-11-02.json', 'tempdata/newsarticles\\2019-11-03.json', 'tempdata/newsarticles\\2019-11-04.json', 'tempdata/newsarticles\\2019-11-05.json', 'tempdata/newsarticles\\2019-11-06.json', 'tempdata/newsarticles\\2019-11-07.json', 'tempdata/newsarticles\\2019-11-08.json', 'tempdata/newsarticles\\2019-11-09.json', 'tempdata/newsarticles\\2019-11-10.json', 
'tempdata/newsarticles\\2019-11-11.json', 'tempdata/newsarticles\\2019-11-12.json', 'tempdata/newsarticles\\2019-11-13.json', 'tempdata/newsarticles\\2019-11-14.json', 'tempdata/newsarticles\\2019-11-15.json', 'tempdata/newsarticles\\2019-11-16.json', 'tempdata/newsarticles\\2019-11-17.json', 'tempdata/newsarticles\\2019-11-18.json', 'tempdata/newsarticles\\2019-11-19.json', 'tempdata/newsarticles\\2019-11-20.json', 'tempdata/newsarticles\\2019-11-21.json', 'tempdata/newsarticles\\2019-11-22.json', 'tempdata/newsarticles\\2019-11-23.json', 'tempdata/newsarticles\\2019-11-24.json', 'tempdata/newsarticles\\2019-11-25.json', 'tempdata/newsarticles\\2019-11-26.json', 'tempdata/newsarticles\\2019-11-27.json', 'tempdata/newsarticles\\2019-11-28.json', 'tempdata/newsarticles\\2019-11-29.json', 'tempdata/newsarticles\\2019-11-30.json', 'tempdata/newsarticles\\2019-12-01.json', 'tempdata/newsarticles\\2019-12-02.json', 'tempdata/newsarticles\\2019-12-03.json', 'tempdata/newsarticles\\2019-12-04.json', 'tempdata/newsarticles\\2019-12-05.json', 'tempdata/newsarticles\\2019-12-06.json', 'tempdata/newsarticles\\2019-12-07.json', 'tempdata/newsarticles\\2019-12-08.json', 'tempdata/newsarticles\\2019-12-09.json', 'tempdata/newsarticles\\2019-12-10.json', 'tempdata/newsarticles\\2019-12-11.json', 'tempdata/newsarticles\\2019-12-12.json', 'tempdata/newsarticles\\2019-12-13.json', 'tempdata/newsarticles\\2019-12-14.json', 'tempdata/newsarticles\\2019-12-15.json', 'tempdata/newsarticles\\2019-12-16.json', 'tempdata/newsarticles\\2019-12-17.json', 'tempdata/newsarticles\\2019-12-18.json', 'tempdata/newsarticles\\2019-12-19.json', 'tempdata/newsarticles\\2019-12-20.json', 'tempdata/newsarticles\\2019-12-21.json', 'tempdata/newsarticles\\2019-12-22.json', 'tempdata/newsarticles\\2019-12-23.json', 'tempdata/newsarticles\\2019-12-24.json', 'tempdata/newsarticles\\2019-12-25.json', 'tempdata/newsarticles\\2019-12-26.json', 'tempdata/newsarticles\\2019-12-27.json', 
'tempdata/newsarticles\\2019-12-28.json', 'tempdata/newsarticles\\2019-12-29.json', 'tempdata/newsarticles\\2019-12-30.json', 'tempdata/newsarticles\\2019-12-31.json']
In [15]:
# Load the second daily article dump into a DataFrame.
# `json_files` is built in an earlier cell (see the printed list above:
# 'tempdata/newsarticles\\YYYY-MM-DD.json'); index 1 is presumably
# 2016-01-02 given the dates shown in Out[16] — TODO confirm the list is sorted.
# NOTE(review): the mixed '/' and '\\' separators in the paths indicate this
# was globbed on Windows; prefer pathlib for portability.
# NOTE(review): `json2` is a weak name — a stage-suffixed name like
# `articles_day` would read better, but later cells may reference `json2`.
json2 = pd.read_json(json_files[1])
In [16]:
# Display the loaded frame via its rich HTML repr (bare last expression).
# NOTE(review): Out[16] shows ~20 Guardian API records with columns such as
# sectionName, webPublicationDate, webTitle, fields; for large daily files
# `json2.head()` would keep the notebook output compact.
json2
Out[16]:
id type sectionId sectionName webPublicationDate webTitle webUrl apiUrl fields isHosted pillarId pillarName
0 artanddesign/2016/jan/02/your-pictures-share-y... article artanddesign Art and design 2016-01-02T22:35:19Z Your pictures: share your photos on the theme ... https://www.theguardian.com/artanddesign/2016/... https://content.guardianapis.com/artanddesign/... {'headline': 'Your pictures: share your photos... False pillar/arts Arts
1 football/live/2016/jan/02/watford-v-manchester... liveblog football Football 2016-01-02T19:25:27Z Watford v Manchester City: Premier League – as... https://www.theguardian.com/football/live/2016... https://content.guardianapis.com/football/live... {'headline': 'Watford v Manchester City: Premi... False pillar/sport Sport
2 us-news/2016/jan/02/new-jersey-newspaper-edito... article us-news US news 2016-01-02T18:58:22Z Two New Jersey newspapers denounce Chris Chris... https://www.theguardian.com/us-news/2016/jan/0... https://content.guardianapis.com/us-news/2016/... {'headline': 'Two New Jersey newspapers denoun... False pillar/news News
3 football/2016/jan/02/steve-gohouri-found-dead-... article football Football 2016-01-02T18:41:38Z Former Ivory Coast defender Steve Gohouri foun... https://www.theguardian.com/football/2016/jan/... https://content.guardianapis.com/football/2016... {'headline': 'Former Ivory Coast defender Stev... False pillar/sport Sport
4 football/blog/2016/jan/02/anthony-martial-wayn... article football Football 2016-01-02T18:14:36Z Anthony Martial and Wayne Rooney prove Manches... https://www.theguardian.com/football/blog/2016... https://content.guardianapis.com/football/blog... {'headline': 'Anthony Martial and Wayne Rooney... False pillar/sport Sport
5 football/live/2016/jan/02/manchester-united-v-... liveblog football Football 2016-01-02T17:40:17Z Manchester United 2-1 Swansea, Espanyol 0-0 Ba... https://www.theguardian.com/football/live/2016... https://content.guardianapis.com/football/live... {'headline': 'Manchester United 2-1 Swansea, E... False pillar/sport Sport
6 world/live/2016/jan/02/middle-east-condemns-sa... liveblog world World news 2016-01-02T16:55:37Z Execution of Shia cleric sparks international ... https://www.theguardian.com/world/live/2016/ja... https://content.guardianapis.com/world/live/20... {'headline': 'Execution of Shia cleric sparks ... False pillar/news News
7 sport/2016/jan/02/lizzie-kelly-tea-for-two-box... article sport Sport 2016-01-02T16:50:08Z Lizzie Kelly: The only person who really belie... https://www.theguardian.com/sport/2016/jan/02/... https://content.guardianapis.com/sport/2016/ja... {'headline': 'Lizzie Kelly: The only person wh... False pillar/sport Sport
8 sport/live/2016/jan/02/south-africa-v-england-... liveblog sport Sport 2016-01-02T16:09:02Z South Africa v England: second Test, day one –... https://www.theguardian.com/sport/live/2016/ja... https://content.guardianapis.com/sport/live/20... {'headline': 'South Africa v England: second T... False pillar/sport Sport
9 football/live/2016/jan/02/west-ham-v-liverpool... liveblog football Football 2016-01-02T14:54:31Z West Ham United 2-0 Liverpool: Premier League ... https://www.theguardian.com/football/live/2016... https://content.guardianapis.com/football/live... {'headline': 'West Ham United 2-0 Liverpool: P... False pillar/sport Sport
10 world/2016/jan/02/tel-aviv-israel-bar-shooting... article world World news 2016-01-02T13:58:06Z Tel Aviv shooting: suspect 'wanted to avenge c... https://www.theguardian.com/world/2016/jan/02/... https://content.guardianapis.com/world/2016/ja... {'headline': 'Tel Aviv shooting: suspect 'want... False pillar/news News
11 us-news/2016/jan/02/us-presidential-election-2... article us-news US news 2016-01-02T12:03:58Z US presidential election 2016: the state of th... https://www.theguardian.com/us-news/2016/jan/0... https://content.guardianapis.com/us-news/2016/... {'headline': 'US presidential election 2016: t... False pillar/news News
12 world/2016/jan/02/sheikh-nimr-al-nimr-shia-cle... article world World news 2016-01-02T12:03:15Z Sheikh Nimr al-Nimr: Shia cleric was a thorn i... https://www.theguardian.com/world/2016/jan/02/... https://content.guardianapis.com/world/2016/ja... {'headline': 'Sheikh Nimr al-Nimr: Shia cleric... False pillar/news News
13 community/ng-interactive/2016/jan/02/your-phot... interactive community Community 2016-01-02T12:00:03Z Your photographs of 2015 – August https://www.theguardian.com/community/ng-inter... https://content.guardianapis.com/community/ng-... {'headline': 'Your photographs of 2015 – Augus... False pillar/news News
14 sport/live/2016/jan/02/melbourne-stars-vs-melb... liveblog sport Sport 2016-01-02T10:28:12Z Melbourne Stars vs Melbourne Renegades: Big Ba... https://www.theguardian.com/sport/live/2016/ja... https://content.guardianapis.com/sport/live/20... {'headline': 'Melbourne Stars vs Melbourne Ren... False pillar/sport Sport
15 world/2016/jan/02/saudi-execution-of-shia-cler... article world World news 2016-01-02T10:24:30Z Saudi execution of Shia cleric sparks outrage ... https://www.theguardian.com/world/2016/jan/02/... https://content.guardianapis.com/world/2016/ja... {'headline': 'Saudi execution of Shia cleric s... False pillar/news News
16 tv-and-radio/2016/jan/02/afternoon-edition-dav... article tv-and-radio Television & radio 2016-01-02T09:00:06Z This week’s best new radio: Afternoon Edition https://www.theguardian.com/tv-and-radio/2016/... https://content.guardianapis.com/tv-and-radio/... {'headline': 'This week’s best new radio: Afte... False pillar/arts Arts
17 public-leaders-network/2016/jan/02/999-call-em... article public-leaders-network Public Leaders Network 2016-01-02T08:26:05Z You would never know I'm crying after I answer... https://www.theguardian.com/public-leaders-net... https://content.guardianapis.com/public-leader... {'headline': 'You would never know I'm crying ... False pillar/news News
18 small-business-network/2016/jan/02/small-busin... article small-business-network Guardian Small Business Network 2016-01-02T07:55:04Z Small business in the spotlight … HomeTouch https://www.theguardian.com/small-business-net... https://content.guardianapis.com/small-busines... {'headline': 'Small business in the spotlight ... False pillar/news News
19 music/2016/jan/02/david-bowie-profile-blacksta... article music Music 2016-01-02T07:00:03Z David Bowie: Back in the spotlight, still refu... https://www.theguardian.com/music/2016/jan/02/... https://content.guardianapis.com/music/2016/ja... {'headline': 'David Bowie: Back in the spotlig... False pillar/arts Arts
20 money/2016/jan/02/bye-buy-to-let-tax-changes-a... article money Money 2016-01-02T07:00:03Z Bye bye buy-to-let … but where next for your m... https://www.theguardian.com/money/2016/jan/02/... https://content.guardianapis.com/money/2016/ja... {'headline': 'Bye bye buy-to-let … but where n... False pillar/lifestyle Lifestyle
21 lifeandstyle/2016/jan/02/i-never-really-believ... article lifeandstyle Life and style 2016-01-02T06:15:02Z I never really believed Mum was dead https://www.theguardian.com/lifeandstyle/2016/... https://content.guardianapis.com/lifeandstyle/... {'headline': 'I never really believed Mum was ... False pillar/lifestyle Lifestyle
22 australia-news/2016/jan/02/immigration-officia... article australia-news Australia news 2016-01-02T06:00:05Z Immigration officials knew refugee Abyan still... https://www.theguardian.com/australia-news/201... https://content.guardianapis.com/australia-new... {'headline': 'Immigration officials knew refug... False pillar/news News
23 lifeandstyle/2016/jan/02/where-did-happily-eve... article lifeandstyle Life and style 2016-01-02T06:00:02Z Where did happily ever after go – and can you ... https://www.theguardian.com/lifeandstyle/2016/... https://content.guardianapis.com/lifeandstyle/... {'headline': 'Where did happily ever after go ... False pillar/lifestyle Lifestyle
24 politics/the-northerner/2016/jan/02/ludovic-ke... article politics Politics 2016-01-02T05:00:01Z Ludovic Kennedy chosen by Liberal party to con... https://www.theguardian.com/politics/the-north... https://content.guardianapis.com/politics/the-... {'headline': 'Ludovic Kennedy to stand as Libe... False pillar/news News
25 technology/2016/jan/01/facebook-truth-trump-obama article technology Technology 2016-01-02T01:25:42Z Is Facebook the enemy of truth and civic unity? https://www.theguardian.com/technology/2016/ja... https://content.guardianapis.com/technology/20... {'headline': 'Is Facebook the enemy of truth a... False pillar/news News
26 tv-and-radio/2016/jan/02/sherlocks-back-and-it... article tv-and-radio Television & radio 2016-01-02T00:13:39Z Sherlock’s back and it's fast, fun, flashy, f... https://www.theguardian.com/tv-and-radio/2016/... https://content.guardianapis.com/tv-and-radio/... {'headline': 'Sherlock’s back and it's fast, ... False pillar/arts Arts
In [17]:
# Build one combined DataFrame from every downloaded JSON batch.
# `json_files` is the list of file paths produced in an earlier cell.
frames = []

# Read each JSON file into a DataFrame and collect it.
# NOTE: loop variable renamed from `json` — the original name shadowed the
# stdlib `json` module imported at the top of the notebook, which would
# break any later `json.load(...)` call.
# NOTE(review): the per-file print('appending...') was removed — it flooded
# the cell output with one line per file (~1,460 lines) and hid the result.
for json_path in json_files:
    newsdata = pd.read_json(json_path)
    frames.append(newsdata)

# Concatenate the per-file frames into a single dataframe called newsarticles.
# NOTE(review): each source frame carries its own 0..n index, so the combined
# index has duplicates; consider pd.concat(frames, ignore_index=True) if any
# downstream cell relies on label-based indexing — left as-is to preserve
# existing behavior.
newsarticles = pd.concat(frames)

# Report the size of the combined dataframe: (rows, columns).
print(newsarticles.shape)
print('completed framing')
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
appending...
(67668, 12)
completed framing
In [18]:
# Preview the first 50 rows of the combined articles dataframe (rich display).
newsarticles.head(50)
Out[18]:
id type sectionId sectionName webPublicationDate webTitle webUrl apiUrl fields isHosted pillarId pillarName
0 us-news/2016/jan/01/tamir-rice-protest-clevela... article us-news US news 2016-01-01T22:44:12Z Tamir Rice protesters picket house of Clevelan... https://www.theguardian.com/us-news/2016/jan/0... https://content.guardianapis.com/us-news/2016/... {'headline': 'Tamir Rice protesters picket hou... False pillar/news News
1 sport/2016/jan/01/itv-win-rights-racing-channel-4 article sport Sport 2016-01-01T22:18:11Z ITV wins terrestrial rights to show racing fro... https://www.theguardian.com/sport/2016/jan/01/... https://content.guardianapis.com/sport/2016/ja... {'headline': 'ITV wins terrestrial rights to s... False pillar/sport Sport
2 us-news/2016/jan/01/natalie-cole-singer-and-da... article us-news US news 2016-01-01T21:45:07Z Natalie Cole, singer and daughter of Nat King ... https://www.theguardian.com/us-news/2016/jan/0... https://content.guardianapis.com/us-news/2016/... {'headline': 'Natalie Cole, singer and daughte... False pillar/news News
3 us-news/2016/jan/01/carly-fiorina-rose-bowl-st... article us-news US news 2016-01-01T21:40:36Z Carly Fiorina tweets support for alma mater's ... https://www.theguardian.com/us-news/2016/jan/0... https://content.guardianapis.com/us-news/2016/... {'headline': 'Carly Fiorina tweets support for... False pillar/news News
4 world/2016/jan/01/slowdown-in-chinese-manufact... article business Business 2016-01-01T18:41:33Z Slowdown in Chinese manufacturing deepens fear... https://www.theguardian.com/world/2016/jan/01/... https://content.guardianapis.com/world/2016/ja... {'headline': 'Slowdown in Chinese manufacturin... False pillar/news News
5 media/2016/jan/01/some-new-year-resolutions-fo... article media Media 2016-01-01T18:26:46Z Some New Year resolutions for the Guardian | L... https://www.theguardian.com/media/2016/jan/01/... https://content.guardianapis.com/media/2016/ja... {'headline': 'Some New Year resolutions for th... False pillar/news News
6 uk-news/2016/jan/01/damehood-for-professor-til... article uk-news UK news 2016-01-01T18:26:03Z Damehood for Professor Til Wykes is a recognit... https://www.theguardian.com/uk-news/2016/jan/0... https://content.guardianapis.com/uk-news/2016/... {'headline': 'Damehood for Professor Til Wykes... False pillar/news News
7 culture/2016/jan/01/shakespeare-to-lift-the-so... article culture Culture 2016-01-01T18:22:46Z Shakespeare to lift the soul and the emotions ... https://www.theguardian.com/culture/2016/jan/0... https://content.guardianapis.com/culture/2016/... {'headline': 'Shakespeare to lift the soul and... False pillar/arts Arts
8 guardian-observer-style-guide-f article info Info 2016-01-01T17:47:00Z Guardian and Observer style guide: F https://www.theguardian.com/guardian-observer-... https://content.guardianapis.com/guardian-obse... {'headline': 'Guardian and Observer style guid... False pillar/news News
9 commentisfree/2016/jan/01/the-guardian-view-on... article commentisfree Opinion 2016-01-01T17:41:46Z The Guardian view on statistical misperception... https://www.theguardian.com/commentisfree/2016... https://content.guardianapis.com/commentisfree... {'headline': 'The Guardian view on statistical... False pillar/opinion Opinion
10 us-news/2016/jan/01/texas-open-carry-handguns-... article us-news US news 2016-01-01T15:46:25Z Open carry of handguns in Texas: fear for some... https://www.theguardian.com/us-news/2016/jan/0... https://content.guardianapis.com/us-news/2016/... {'headline': 'Open carry of handguns in Texas:... False pillar/news News
11 world/2016/jan/01/dubai-skyscraper-fire-briton... article world World news 2016-01-01T13:57:10Z Dubai skyscraper fire: Briton tells of rescuin... https://www.theguardian.com/world/2016/jan/01/... https://content.guardianapis.com/world/2016/ja... {'headline': 'Dubai skyscraper fire: Briton te... False pillar/news News
12 world/2016/jan/01/pegida-leader-lutz-bachmann-... article world World news 2016-01-01T13:34:25Z Pegida leader criticised for linking Munich te... https://www.theguardian.com/world/2016/jan/01/... https://content.guardianapis.com/world/2016/ja... {'headline': 'Pegida leader criticised for lin... False pillar/news News
13 world/2016/jan/01/turkish-president-recep-tayy... article world World news 2016-01-01T13:27:15Z Erdoğan cites Hitler's Germany as example of e... https://www.theguardian.com/world/2016/jan/01/... https://content.guardianapis.com/world/2016/ja... {'headline': 'Erdoğan cites Hitler's Germany a... False pillar/news News
14 us-news/2016/jan/01/us-presidential-election-2... article us-news US news 2016-01-01T13:00:16Z US presidential election 2016: the state of th... https://www.theguardian.com/us-news/2016/jan/0... https://content.guardianapis.com/us-news/2016/... {'headline': 'US presidential election 2016: t... False pillar/news News
15 world/2016/jan/01/amsterdam-schiphol-airport-e... article world World news 2016-01-01T12:26:24Z Amsterdam airport check-in evacuated as Britis... https://www.theguardian.com/world/2016/jan/01/... https://content.guardianapis.com/world/2016/ja... {'headline': 'Amsterdam airport check-in evacu... False pillar/news News
16 sustainable-business/2016/jan/01/climate-chang... article sustainable-business Guardian Sustainable Business 2016-01-01T12:00:10Z How to help your company prepare for climate c... https://www.theguardian.com/sustainable-busine... https://content.guardianapis.com/sustainable-b... {'headline': 'How to help your company prepare... False NaN NaN
17 community/ng-interactive/2016/jan/01/your-phot... interactive community Community 2016-01-01T12:00:10Z Your photographs of 2015 - July https://www.theguardian.com/community/ng-inter... https://content.guardianapis.com/community/ng-... {'headline': 'Your photographs of 2015 - July'... False pillar/news News
18 world/2015/dec/31/munich-police-warn-of-immine... article world World news 2016-01-01T11:29:15Z Munich suicide bomb plot: police hunt Isis sus... https://www.theguardian.com/world/2015/dec/31/... https://content.guardianapis.com/world/2015/de... {'headline': 'Munich suicide bomb plot: police... False pillar/news News
19 culture/2016/jan/01/martin-scorsese-vinyl-zool... article culture Culture 2016-01-01T10:00:00Z Martin Scorsese’s Vinyl, Zoolander 2 and Drake... https://www.theguardian.com/culture/2016/jan/0... https://content.guardianapis.com/culture/2016/... {'headline': 'Martin Scorsese’s Vinyl, Zooland... False pillar/arts Arts
20 film/filmblog/2016/jan/01/robert-de-niro-five-... article film Film 2016-01-01T10:00:00Z Robert De Niro: five best moments https://www.theguardian.com/film/filmblog/2016... https://content.guardianapis.com/film/filmblog... {'headline': 'Robert De Niro: five best moment... False pillar/arts Arts
21 culture/2016/jan/01/shakespeares-400th-anniver... article culture Culture 2016-01-01T10:00:00Z Shakespeare's 400th anniversary: 'man of Strat... https://www.theguardian.com/culture/2016/jan/0... https://content.guardianapis.com/culture/2016/... {'headline': 'Shakespeare's 400th anniversary:... False pillar/arts Arts
22 world/2016/jan/01/putin-portraits-2015-simpson... article world World news 2016-01-01T09:30:28Z A year in Putin portraits, from mocking memes ... https://www.theguardian.com/world/2016/jan/01/... https://content.guardianapis.com/world/2016/ja... {'headline': 'A year in Putin portraits, from ... False pillar/news News
23 housing-network/2016/jan/01/britains-housing-c... article housing-network Housing Network 2016-01-01T09:12:27Z Five steps to fixing the UK housing crisis in ... https://www.theguardian.com/housing-network/20... https://content.guardianapis.com/housing-netwo... {'headline': 'Five steps to fixing the UK hous... False pillar/news News
24 books/2016/jan/01/books-literary-calendar-2016 article books Books 2016-01-01T08:00:26Z Books in 2016: a literary calendar https://www.theguardian.com/books/2016/jan/01/... https://content.guardianapis.com/books/2016/ja... {'headline': 'Books in 2016: a literary calend... False pillar/arts Arts
25 commentisfree/2016/jan/01/lemmy-gambled-won-mo... article commentisfree Opinion 2016-01-01T08:00:26Z Lemmy gambled and won – but kids, don’t try th... https://www.theguardian.com/commentisfree/2016... https://content.guardianapis.com/commentisfree... {'headline': 'Lemmy gambled and won – but kids... False pillar/opinion Opinion
26 news/2016/jan/01/the-guardian-biggest-stories-... article news News 2016-01-01T08:00:26Z The Guardian's biggest stories of 2015 https://www.theguardian.com/news/2016/jan/01/t... https://content.guardianapis.com/news/2016/jan... {'headline': 'The Guardian's biggest stories o... False pillar/news News
27 tv-and-radio/2016/jan/01/panda-babies-coronati... article tv-and-radio Television & radio 2016-01-01T07:30:25Z Panda Babies review – bamboozled by tummy tick... https://www.theguardian.com/tv-and-radio/2016/... https://content.guardianapis.com/tv-and-radio/... {'headline': 'Panda Babies review – bamboozled... False pillar/arts Arts
28 environment/2016/jan/01/northern-territory-rem... article environment Environment 2016-01-01T07:29:43Z Northern Territory removes 290 saltwater croco... https://www.theguardian.com/environment/2016/j... https://content.guardianapis.com/environment/2... {'headline': 'Northern Territory removes 290 s... False pillar/news News
29 global-development/2016/jan/01/access-to-justi... article global-development Global development 2016-01-01T07:00:25Z Access to justice for all? Now that would be a... https://www.theguardian.com/global-development... https://content.guardianapis.com/global-develo... {'headline': 'Access to justice for all? Now t... False pillar/news News
30 world/live/2015/dec/31/new-years-eve-celebrati... liveblog world World news 2016-01-01T05:55:46Z Munich police warned of attack by 'five to sev... https://www.theguardian.com/world/live/2015/de... https://content.guardianapis.com/world/live/20... {'headline': 'Munich police warned of attack b... False pillar/news News
31 us-news/2015/dec/31/hillary-clinton-emails-ang... article us-news US news 2016-01-01T04:36:07Z Hillary Clinton was told Angela Merkel is agai... https://www.theguardian.com/us-news/2015/dec/3... https://content.guardianapis.com/us-news/2015/... {'headline': 'Hillary Clinton was told Angela ... False pillar/news News
32 media/2016/jan/01/brian-johns-a-great-australi... article media Media 2016-01-01T01:22:12Z Former ABC managing director Brian Johns dies ... https://www.theguardian.com/media/2016/jan/01/... https://content.guardianapis.com/media/2016/ja... {'headline': 'Former ABC managing director Bri... False pillar/news News
33 world/2016/jan/01/novel-about-jewish-palestini... article world World news 2016-01-01T01:04:07Z Novel about Jewish-Palestinian love affair is ... https://www.theguardian.com/world/2016/jan/01/... https://content.guardianapis.com/world/2016/ja... {'headline': 'Novel about Jewish-Palestinian l... False pillar/news News
34 uk-news/2015/dec/31/new-year-celebrations-uk-s... article uk-news UK news 2016-01-01T00:11:22Z UK new year events go ahead amid heightened se... https://www.theguardian.com/uk-news/2015/dec/3... https://content.guardianapis.com/uk-news/2015/... {'headline': 'UK new year events go ahead amid... False pillar/news News
0 artanddesign/2016/jan/02/your-pictures-share-y... article artanddesign Art and design 2016-01-02T22:35:19Z Your pictures: share your photos on the theme ... https://www.theguardian.com/artanddesign/2016/... https://content.guardianapis.com/artanddesign/... {'headline': 'Your pictures: share your photos... False pillar/arts Arts
1 football/live/2016/jan/02/watford-v-manchester... liveblog football Football 2016-01-02T19:25:27Z Watford v Manchester City: Premier League – as... https://www.theguardian.com/football/live/2016... https://content.guardianapis.com/football/live... {'headline': 'Watford v Manchester City: Premi... False pillar/sport Sport
2 us-news/2016/jan/02/new-jersey-newspaper-edito... article us-news US news 2016-01-02T18:58:22Z Two New Jersey newspapers denounce Chris Chris... https://www.theguardian.com/us-news/2016/jan/0... https://content.guardianapis.com/us-news/2016/... {'headline': 'Two New Jersey newspapers denoun... False pillar/news News
3 football/2016/jan/02/steve-gohouri-found-dead-... article football Football 2016-01-02T18:41:38Z Former Ivory Coast defender Steve Gohouri foun... https://www.theguardian.com/football/2016/jan/... https://content.guardianapis.com/football/2016... {'headline': 'Former Ivory Coast defender Stev... False pillar/sport Sport
4 football/blog/2016/jan/02/anthony-martial-wayn... article football Football 2016-01-02T18:14:36Z Anthony Martial and Wayne Rooney prove Manches... https://www.theguardian.com/football/blog/2016... https://content.guardianapis.com/football/blog... {'headline': 'Anthony Martial and Wayne Rooney... False pillar/sport Sport
5 football/live/2016/jan/02/manchester-united-v-... liveblog football Football 2016-01-02T17:40:17Z Manchester United 2-1 Swansea, Espanyol 0-0 Ba... https://www.theguardian.com/football/live/2016... https://content.guardianapis.com/football/live... {'headline': 'Manchester United 2-1 Swansea, E... False pillar/sport Sport
6 world/live/2016/jan/02/middle-east-condemns-sa... liveblog world World news 2016-01-02T16:55:37Z Execution of Shia cleric sparks international ... https://www.theguardian.com/world/live/2016/ja... https://content.guardianapis.com/world/live/20... {'headline': 'Execution of Shia cleric sparks ... False pillar/news News
7 sport/2016/jan/02/lizzie-kelly-tea-for-two-box... article sport Sport 2016-01-02T16:50:08Z Lizzie Kelly: The only person who really belie... https://www.theguardian.com/sport/2016/jan/02/... https://content.guardianapis.com/sport/2016/ja... {'headline': 'Lizzie Kelly: The only person wh... False pillar/sport Sport
8 sport/live/2016/jan/02/south-africa-v-england-... liveblog sport Sport 2016-01-02T16:09:02Z South Africa v England: second Test, day one –... https://www.theguardian.com/sport/live/2016/ja... https://content.guardianapis.com/sport/live/20... {'headline': 'South Africa v England: second T... False pillar/sport Sport
9 football/live/2016/jan/02/west-ham-v-liverpool... liveblog football Football 2016-01-02T14:54:31Z West Ham United 2-0 Liverpool: Premier League ... https://www.theguardian.com/football/live/2016... https://content.guardianapis.com/football/live... {'headline': 'West Ham United 2-0 Liverpool: P... False pillar/sport Sport
10 world/2016/jan/02/tel-aviv-israel-bar-shooting... article world World news 2016-01-02T13:58:06Z Tel Aviv shooting: suspect 'wanted to avenge c... https://www.theguardian.com/world/2016/jan/02/... https://content.guardianapis.com/world/2016/ja... {'headline': 'Tel Aviv shooting: suspect 'want... False pillar/news News
11 us-news/2016/jan/02/us-presidential-election-2... article us-news US news 2016-01-02T12:03:58Z US presidential election 2016: the state of th... https://www.theguardian.com/us-news/2016/jan/0... https://content.guardianapis.com/us-news/2016/... {'headline': 'US presidential election 2016: t... False pillar/news News
12 world/2016/jan/02/sheikh-nimr-al-nimr-shia-cle... article world World news 2016-01-02T12:03:15Z Sheikh Nimr al-Nimr: Shia cleric was a thorn i... https://www.theguardian.com/world/2016/jan/02/... https://content.guardianapis.com/world/2016/ja... {'headline': 'Sheikh Nimr al-Nimr: Shia cleric... False pillar/news News
13 community/ng-interactive/2016/jan/02/your-phot... interactive community Community 2016-01-02T12:00:03Z Your photographs of 2015 – August https://www.theguardian.com/community/ng-inter... https://content.guardianapis.com/community/ng-... {'headline': 'Your photographs of 2015 – Augus... False pillar/news News
14 sport/live/2016/jan/02/melbourne-stars-vs-melb... liveblog sport Sport 2016-01-02T10:28:12Z Melbourne Stars vs Melbourne Renegades: Big Ba... https://www.theguardian.com/sport/live/2016/ja... https://content.guardianapis.com/sport/live/20... {'headline': 'Melbourne Stars vs Melbourne Ren... False pillar/sport Sport
In [19]:
# Count how many article ids are missing in the raw dataset
newsarticles['id'].isnull().sum()
Out[19]:
0
In [20]:
# Report the dimensions of the raw news dataset.
# The original cell evaluated `newsarticles.shape[0]` on its own line, whose
# value was discarded (only the last expression of a cell is displayed); that
# dead statement is removed and the print uses a single f-string.
n_rows, n_cols = newsarticles.shape
print(f'There are {n_rows} rows and {n_cols} columns')
There are 67668 rows and 12 columns

Processing Data

Cleaning Data

News Dataset

In [21]:
newsarticles['pillarName'].unique()
Out[21]:
array(['News', 'Sport', 'Arts', 'Opinion', nan, 'Lifestyle'], dtype=object)
In [22]:
# Restrict the dataset to articles in the 'News' pillar
is_news = newsarticles['pillarName'] == 'News'
newsarticles2 = newsarticles[is_news]
newsarticles2.shape
Out[22]:
(40320, 12)
In [23]:
newsarticles2.head()
Out[23]:
id type sectionId sectionName webPublicationDate webTitle webUrl apiUrl fields isHosted pillarId pillarName
0 us-news/2016/jan/01/tamir-rice-protest-clevela... article us-news US news 2016-01-01T22:44:12Z Tamir Rice protesters picket house of Clevelan... https://www.theguardian.com/us-news/2016/jan/0... https://content.guardianapis.com/us-news/2016/... {'headline': 'Tamir Rice protesters picket hou... False pillar/news News
2 us-news/2016/jan/01/natalie-cole-singer-and-da... article us-news US news 2016-01-01T21:45:07Z Natalie Cole, singer and daughter of Nat King ... https://www.theguardian.com/us-news/2016/jan/0... https://content.guardianapis.com/us-news/2016/... {'headline': 'Natalie Cole, singer and daughte... False pillar/news News
3 us-news/2016/jan/01/carly-fiorina-rose-bowl-st... article us-news US news 2016-01-01T21:40:36Z Carly Fiorina tweets support for alma mater's ... https://www.theguardian.com/us-news/2016/jan/0... https://content.guardianapis.com/us-news/2016/... {'headline': 'Carly Fiorina tweets support for... False pillar/news News
4 world/2016/jan/01/slowdown-in-chinese-manufact... article business Business 2016-01-01T18:41:33Z Slowdown in Chinese manufacturing deepens fear... https://www.theguardian.com/world/2016/jan/01/... https://content.guardianapis.com/world/2016/ja... {'headline': 'Slowdown in Chinese manufacturin... False pillar/news News
5 media/2016/jan/01/some-new-year-resolutions-fo... article media Media 2016-01-01T18:26:46Z Some New Year resolutions for the Guardian | L... https://www.theguardian.com/media/2016/jan/01/... https://content.guardianapis.com/media/2016/ja... {'headline': 'Some New Year resolutions for th... False pillar/news News
In [24]:
newsarticles2.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 40320 entries, 0 to 37
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   id                  40320 non-null  object
 1   type                40320 non-null  object
 2   sectionId           40320 non-null  object
 3   sectionName         40320 non-null  object
 4   webPublicationDate  40320 non-null  object
 5   webTitle            40320 non-null  object
 6   webUrl              40320 non-null  object
 7   apiUrl              40320 non-null  object
 8   fields              40320 non-null  object
 9   isHosted            40320 non-null  bool  
 10  pillarId            40320 non-null  object
 11  pillarName          40320 non-null  object
dtypes: bool(1), object(11)
memory usage: 3.7+ MB
In [25]:
# Keep the publication timestamps in a separate single-column frame,
# re-numbered 0..n-1 so it can later be aligned back by row position
publicationdate = newsarticlesdf2 = None  # placeholder removed below
publicationdate = newsarticles2.loc[:, ['webPublicationDate']]
publicationdate = publicationdate.reset_index(drop=True)
publicationdate.head()
Out[25]:
webPublicationDate
0 2016-01-01T22:44:12Z
1 2016-01-01T21:45:07Z
2 2016-01-01T21:40:36Z
3 2016-01-01T18:41:33Z
4 2016-01-01T18:26:46Z
In [26]:
#Column fields has the relevant information required for this analysis
# NOTE(review): .apply(pd.Series) expands each dict into columns but is slow
# on 40k rows; pd.json_normalize may be faster — confirm every 'fields' entry
# is a dict (no NaN) before switching.
newsarticlesdf = newsarticles2['fields'].apply(pd.Series)
print(newsarticlesdf.shape)
(40320, 36)
In [27]:
#Reset index to 0..n-1 so this frame lines up row-by-row with publicationdate
newsarticlesdf.reset_index(drop = True, inplace = True)
newsarticlesdf.head()
Out[27]:
headline standfirst trailText byline main body wordcount commentCloseDate commentable firstPublicationDate ... isLive displayHint liveBloggingNow allowUgc contributorBio showAffiliateLinks sensitive membershipAccess starRating shouldHideReaderRevenue
0 Tamir Rice protesters picket house of Clevelan... <ul><li>Crowd marks time before boy shot by po... Crowd marks time before boy shot by police off... Afi Scruggs in Cleveland <figure class="element element-image" data-med... <p>Anger over the decision <a href="http://www... 627 2016-01-04T22:45:00Z true 2016-01-01T22:44:12Z ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
1 Natalie Cole, singer and daughter of Nat King ... <p>Singer’s family pay tribute to ‘fierce and ... Singer’s family pay tribute to ‘fierce and cou... Matthew Taylor, Martin Pengelly and agencies <figure class="element element-video" data-can... <p><a href="http://www.theguardian.com/culture... 728 2016-01-04T17:15:00Z true 2016-01-01T17:17:40Z ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2 Carly Fiorina tweets support for alma mater's ... <p>‘Love my alma mater,’ said the former Hewle... ‘Love my alma mater,’ said the former Hewlett-... Martin Pengelly in New York <figure class="element element-image" data-med... <p>The Republican presidential candidate Carly... 314 2016-01-04T20:45:00Z true 2016-01-01T20:52:04Z ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
3 Slowdown in Chinese manufacturing deepens fear... <p>Factory activity cools for fifth month runn... Factory activity cools for fifth month running... Katie Allen <figure class="element element-image" data-med... <p>A further slowdown in China’s vast manufact... 687 2016-01-04T18:45:00Z true 2016-01-01T18:41:33Z ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
4 Some New Year resolutions for the Guardian NaN <strong>Letters:</strong> There’s growing evid... Letters <figure class="element element-image" data-med... <p>It’s inspiring to read in the <a href="http... 952 NaN false 2016-01-01T18:26:46Z ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

5 rows × 36 columns

In [28]:
newsarticlesdf.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40320 entries, 0 to 40319
Data columns (total 36 columns):
 #   Column                         Non-Null Count  Dtype 
---  ------                         --------------  ----- 
 0   headline                       40320 non-null  object
 1   standfirst                     40004 non-null  object
 2   trailText                      40320 non-null  object
 3   byline                         39411 non-null  object
 4   main                           40320 non-null  object
 5   body                           40320 non-null  object
 6   wordcount                      40320 non-null  object
 7   commentCloseDate               14911 non-null  object
 8   commentable                    17013 non-null  object
 9   firstPublicationDate           40316 non-null  object
 10  isInappropriateForSponsorship  40145 non-null  object
 11  isPremoderated                 40320 non-null  object
 12  lastModified                   40320 non-null  object
 13  productionOffice               40320 non-null  object
 14  publication                    40320 non-null  object
 15  shortUrl                       40320 non-null  object
 16  shouldHideAdverts              40320 non-null  object
 17  showInRelatedContent           40320 non-null  object
 18  thumbnail                      39833 non-null  object
 19  legallySensitive               40228 non-null  object
 20  lang                           40320 non-null  object
 21  bodyText                       40320 non-null  object
 22  charCount                      40320 non-null  object
 23  bylineHtml                     39411 non-null  object
 24  newspaperPageNumber            14677 non-null  object
 25  newspaperEditionDate           14746 non-null  object
 26  isLive                         8345 non-null   object
 27  displayHint                    1854 non-null   object
 28  liveBloggingNow                3575 non-null   object
 29  allowUgc                       437 non-null    object
 30  contributorBio                 522 non-null    object
 31  showAffiliateLinks             13481 non-null  object
 32  sensitive                      7517 non-null   object
 33  membershipAccess               40 non-null     object
 34  starRating                     26 non-null     object
 35  shouldHideReaderRevenue        20591 non-null  object
dtypes: object(36)
memory usage: 11.1+ MB
In [29]:
# Attach the publication date column by aligning on the shared 0..n-1 index
# (both frames were reset to the same RangeIndex above)
newsarticlesdate = pd.concat([newsarticlesdf, publicationdate], axis=1)
newsarticlesdate.head()
Out[29]:
headline standfirst trailText byline main body wordcount commentCloseDate commentable firstPublicationDate ... displayHint liveBloggingNow allowUgc contributorBio showAffiliateLinks sensitive membershipAccess starRating shouldHideReaderRevenue webPublicationDate
0 Tamir Rice protesters picket house of Clevelan... <ul><li>Crowd marks time before boy shot by po... Crowd marks time before boy shot by police off... Afi Scruggs in Cleveland <figure class="element element-image" data-med... <p>Anger over the decision <a href="http://www... 627 2016-01-04T22:45:00Z true 2016-01-01T22:44:12Z ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 2016-01-01T22:44:12Z
1 Natalie Cole, singer and daughter of Nat King ... <p>Singer’s family pay tribute to ‘fierce and ... Singer’s family pay tribute to ‘fierce and cou... Matthew Taylor, Martin Pengelly and agencies <figure class="element element-video" data-can... <p><a href="http://www.theguardian.com/culture... 728 2016-01-04T17:15:00Z true 2016-01-01T17:17:40Z ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 2016-01-01T21:45:07Z
2 Carly Fiorina tweets support for alma mater's ... <p>‘Love my alma mater,’ said the former Hewle... ‘Love my alma mater,’ said the former Hewlett-... Martin Pengelly in New York <figure class="element element-image" data-med... <p>The Republican presidential candidate Carly... 314 2016-01-04T20:45:00Z true 2016-01-01T20:52:04Z ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 2016-01-01T21:40:36Z
3 Slowdown in Chinese manufacturing deepens fear... <p>Factory activity cools for fifth month runn... Factory activity cools for fifth month running... Katie Allen <figure class="element element-image" data-med... <p>A further slowdown in China’s vast manufact... 687 2016-01-04T18:45:00Z true 2016-01-01T18:41:33Z ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 2016-01-01T18:41:33Z
4 Some New Year resolutions for the Guardian NaN <strong>Letters:</strong> There’s growing evid... Letters <figure class="element element-image" data-med... <p>It’s inspiring to read in the <a href="http... 952 NaN false 2016-01-01T18:26:46Z ... NaN NaN NaN NaN NaN NaN NaN NaN NaN 2016-01-01T18:26:46Z

5 rows × 37 columns

In [30]:
# Keep only the columns needed for this investigation
keep_cols = ['headline', 'body', 'charCount', 'wordcount',
             'lastModified', 'publication', 'webPublicationDate']
newsarticlesdf2 = newsarticlesdate[keep_cols]
newsarticlesdf2.head()
Out[30]:
headline body charCount wordcount lastModified publication webPublicationDate
0 Tamir Rice protesters picket house of Clevelan... <p>Anger over the decision <a href="http://www... 3704 627 2017-07-14T20:17:15Z theguardian.com 2016-01-01T22:44:12Z
1 Natalie Cole, singer and daughter of Nat King ... <p><a href="http://www.theguardian.com/culture... 4102 728 2017-11-29T04:58:24Z The Guardian 2016-01-01T21:45:07Z
2 Carly Fiorina tweets support for alma mater's ... <p>The Republican presidential candidate Carly... 1980 314 2017-07-14T20:17:16Z theguardian.com 2016-01-01T21:40:36Z
3 Slowdown in Chinese manufacturing deepens fear... <p>A further slowdown in China’s vast manufact... 4128 687 2017-11-29T04:58:43Z The Guardian 2016-01-01T18:41:33Z
4 Some New Year resolutions for the Guardian <p>It’s inspiring to read in the <a href="http... 5700 952 2017-11-29T04:58:51Z The Guardian 2016-01-01T18:26:46Z

Comments
Web publication date has been taken from the original dataset and merged back to obtain the original date of publication. The webPublicationDate column is present for all rows.

In [31]:
#Parse the ISO-8601 web publication timestamp into a datetime column.
#Using .assign returns a new frame, which fixes the SettingWithCopyWarning
#raised by writing a new column into a slice of newsarticlesdate.
newsarticlesdf2 = newsarticlesdf2.assign(
    PublicationDate=pd.to_datetime(newsarticlesdf2['webPublicationDate'],
                                   errors='coerce')
)
C:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
In [32]:
#Check for NA in every remaining column (coerced dates could have produced NaT)
newsarticlesdf2.isna().sum()
Out[32]:
headline              0
body                  0
charCount             0
wordcount             0
lastModified          0
publication           0
webPublicationDate    0
PublicationDate       0
dtype: int64

Stock Indices Dataset

In [33]:
#Find the type of each of the columns — everything is object (strings),
#so Date and Price will need explicit conversion below
ftse.dtypes
Out[33]:
Date      object
Price     object
Open      object
High      object
Low       object
Volume    object
Chg%      object
dtype: object
In [34]:
SP500.dtypes
Out[34]:
Date         datetime64[ns]
High                float64
Low                 float64
Open                float64
Close               float64
Volume                int64
Adj Close           float64
dtype: object
In [35]:
# Drop the intraday/volume columns that the analysis does not use,
# keeping only Date and the closing price in each frame
ftse.drop(columns=['High', 'Open', 'Low', 'Volume', 'Chg%'], inplace=True)
SP500.drop(columns=['High', 'Low', 'Open', 'Volume', 'Adj Close'], inplace=True)
In [36]:
ftse.head()
Out[36]:
Date Price
0 Dec 31, 2019 7,542.44
1 Dec 30, 2019 7,587.05
2 Dec 27, 2019 7,644.90
3 Dec 24, 2019 7,632.24
4 Dec 23, 2019 7,623.59
In [37]:
SP500.head()
Out[37]:
Date Close
0 2015-12-31 2043.939941
1 2016-01-04 2012.660034
2 2016-01-05 2016.709961
3 2016-01-06 1990.260010
4 2016-01-07 1943.089966
In [38]:
# Rename Price to Close so the FTSE frame matches the S&P 500 schema
ftse.rename(columns={'Price': 'Close'}, inplace=True)
In [39]:
# FTSE close prices are strings with thousands separators (e.g. '7,542.44');
# strip the commas first, since astype(float64) cannot parse them directly
ftse['Date'] = pd.to_datetime(ftse['Date'])
ftse['Close'] = ftse['Close'].str.replace(',', '', regex=False).astype(np.float64)
ftse.head()
Out[39]:
Date Close
0 2019-12-31 7542.44
1 2019-12-30 7587.05
2 2019-12-27 7644.90
3 2019-12-24 7632.24
4 2019-12-23 7623.59
In [40]:
# Reindex onto a complete daily calendar so weekend/holiday gaps become
# explicit rows (their Close is NaN for now, filled in the next cell)
ftsedates = pd.date_range(start=ftse['Date'].min(), end=ftse['Date'].max())
ftseNew = (
    ftse.set_index('Date')
        .reindex(ftsedates)
        .rename_axis('Date')
        .reset_index()
)
In [41]:
ftseNew.head()
Out[41]:
Date Close
0 2016-01-04 6093.43
1 2016-01-05 6137.24
2 2016-01-06 6073.38
3 2016-01-07 5954.08
4 2016-01-08 5912.44
In [42]:
#Forward-fill the inserted calendar rows: markets are closed on weekends and
#holidays, so carry the previous trading day's close forward.
#fillna(method='ffill') is deprecated (removed in pandas 3.0) — use Series.ffill.
ftseNew['Close'] = ftseNew['Close'].ffill()
In [43]:
ftseNew.head()
Out[43]:
Date Close
0 2016-01-04 6093.43
1 2016-01-05 6137.24
2 2016-01-06 6073.38
3 2016-01-07 5954.08
4 2016-01-08 5912.44
In [44]:
SP500.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1007 entries, 0 to 1006
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   Date    1007 non-null   datetime64[ns]
 1   Close   1007 non-null   float64       
dtypes: datetime64[ns](1), float64(1)
memory usage: 15.9 KB
In [45]:
# Expand the S&P 500 series to a full daily calendar; non-trading days
# appear as NaN rows to be forward-filled in the next cell
SP500dates = pd.date_range(start=SP500['Date'].min(), end=SP500['Date'].max())
SP500New = (
    SP500.set_index('Date')
         .reindex(SP500dates)
         .rename_axis('Date')
         .reset_index()
)
In [46]:
SP500New.head()
Out[46]:
Date Close
0 2015-12-31 2043.939941
1 2016-01-01 NaN
2 2016-01-02 NaN
3 2016-01-03 NaN
4 2016-01-04 2012.660034
In [47]:
#Forward-fill non-trading days with the previous trading day's close.
#fillna(method='ffill') is deprecated (removed in pandas 3.0) — use Series.ffill.
SP500New['Close'] = SP500New['Close'].ffill()
In [48]:
SP500New.head()
Out[48]:
Date Close
0 2015-12-31 2043.939941
1 2016-01-01 2043.939941
2 2016-01-02 2043.939941
3 2016-01-03 2043.939941
4 2016-01-04 2012.660034
In [49]:
#To do time series analysis, set the index to be the date column
ftseTime = ftseNew.set_index('Date')
ftseTime.head()
Out[49]:
Close
Date
2016-01-04 6093.43
2016-01-05 6137.24
2016-01-06 6073.38
2016-01-07 5954.08
2016-01-08 5912.44
In [50]:
# Same date-indexed view for the S&P 500, for time-series plotting
SP500Time = SP500New.set_index('Date')
SP500Time.head()
Out[50]:
Close
Date
2015-12-31 2043.939941
2016-01-01 2043.939941
2016-01-02 2043.939941
2016-01-03 2043.939941
2016-01-04 2012.660034

Exchange Rate dataset

In [51]:
# Drop intraday Open/High/Low — only the day's Price matters for this analysis
FXUSD.drop(columns=['Open', 'High', 'Low'], inplace=True)
FXUSD.head()
Out[51]:
Date Price Volume Chg%
0 Dec 31, 2019 1.3261 97.86K 1.11%
1 Dec 30, 2019 1.3115 100.82K 0.29%
2 Dec 27, 2019 1.3077 85.56K 0.65%
3 Dec 26, 2019 1.2993 99.65K 0.25%
4 Dec 25, 2019 1.2961 34.43K 0.12%
In [52]:
FXUSD['Date'] = pd.to_datetime(FXUSD['Date'])
In [53]:
FXUSD.head()
Out[53]:
Date Price Volume Chg%
0 2019-12-31 1.3261 97.86K 1.11%
1 2019-12-30 1.3115 100.82K 0.29%
2 2019-12-27 1.3077 85.56K 0.65%
3 2019-12-26 1.2993 99.65K 0.25%
4 2019-12-25 1.2961 34.43K 0.12%
In [54]:
#Reindex the FX series onto a full daily calendar and forward-fill the price
#over weekends/holidays. Series.ffill replaces the deprecated
#fillna(method='ffill', inplace=True) form (removed in pandas 3.0).
FXUSDdates = pd.date_range(start=FXUSD.Date.min(), end=FXUSD.Date.max())
FXUSDNew = FXUSD.set_index('Date').reindex(FXUSDdates).rename_axis('Date').reset_index()
FXUSDNew['Price'] = FXUSDNew['Price'].ffill()
FXUSDTime = FXUSDNew.set_index('Date')
In [55]:
FXUSDTime.head()
Out[55]:
Price Volume Chg%
Date
2016-01-01 1.4748 0 0.06%
2016-01-02 1.4748 NaN NaN
2016-01-03 1.4748 NaN NaN
2016-01-04 1.4718 0 -0.20%
2016-01-05 1.4672 0 -0.31%

Comments

The indices do not move on weekends or public holidays in the respective country, as stock markets are closed. These dates are not included within the data, so we will find these dates and keep the price the same as on the previous trading day. The same applies to the FX rates, where the FX markets are generally closed on Christmas Day and New Year's Day.

Initial Analysis

Stock Indices Dataset

In [56]:
# FTSE 100 closing-price time series
ax = ftseTime['Close'].plot(grid=True)
ax.set_ylabel('Close Price')
ax.set_title('FTSE 100 Close Price')
plt.show()
In [57]:
# Plot the closing prices for the S&P 500
# (earlier comment said DJIA, but SP500Time is plotted here)
SP500Time['Close'].plot(grid = True)

plt.ylabel('Close Price')
plt.title('S&P 500 Close Price')
plt.show()
In [58]:
# Overlay FTSE 100 and S&P 500 closes on one axis for comparison
for series_df, series_label in ((ftseTime, "FTSE 100"), (SP500Time, "S&P 500")):
    series_df['Close'].plot(grid=True, label=series_label)

plt.ylabel('Close Price')
plt.title('FTSE 100 & S&P 500')
plt.legend()
plt.show()
In [59]:
print("FTSE 100 reached an all-time high on", ftseNew.max())
FTSE 100 reached an all-time high on Date     2019-12-31 00:00:00
Close                7877.45
dtype: object
In [60]:
print("FTSE 100 reached an all-time low on",ftseNew.min())
FTSE 100 reached an all-time low on Date     2016-01-04 00:00:00
Close                5536.97
dtype: object
In [61]:
print("S&P 500 reached an all-time high on", SP500New.max())
S&P 500 reached an all-time high on Date     2019-12-31 00:00:00
Close                3240.02
dtype: object
In [62]:
print("S&P 500 reached an all-time low on", SP500New.min())
S&P 500 reached an all-time low on Date     2015-12-31 00:00:00
Close                1829.08
dtype: object

FX Dataset

In [63]:
# Plot the daily USD/GBP exchange rate
# (earlier comment said DJIA, but FXUSDTime is plotted here)
FXUSDTime['Price'].plot(grid = True)

plt.ylabel('Price')
plt.title('USD/GBP')
plt.show()
In [64]:
print("USD/GBP exchange reached an all-time high on", FXUSDNew.max())
USD/GBP exchange reached an all-time high on Date     2019-12-31 00:00:00
Price                 1.4879
dtype: object
In [65]:
print("USD/GBP exchange reached an all-time low on", FXUSDNew.min())
USD/GBP exchange reached an all-time low on Date     2016-01-01 00:00:00
Price                 1.2037
dtype: object

Feature Engineering

News Dataset

Features to be applied to the news dataset:

  • Number of words in a headline
  • Number of characters in a headline
  • Day when article is published
  • Month when article is published
  • Year when article is published
  • Sentiment of headline: Positive/Negative/Neutral
  • LDA topic analysis
In [66]:
#Split the publication timestamp into separate date and time columns.
#Vectorized .dt accessors replace the Python list comprehensions, and
#.assign returns a new frame, fixing the SettingWithCopyWarning.
newsarticlesdf2 = newsarticlesdf2.assign(
    PublishDate=newsarticlesdf2['PublicationDate'].dt.date,
    PublishTime=newsarticlesdf2['PublicationDate'].dt.time,
)
newsarticlesdf2.head()
C:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
C:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
Out[66]:
headline body charCount wordcount lastModified publication webPublicationDate PublicationDate PublishDate PublishTime
0 Tamir Rice protesters picket house of Clevelan... <p>Anger over the decision <a href="http://www... 3704 627 2017-07-14T20:17:15Z theguardian.com 2016-01-01T22:44:12Z 2016-01-01 22:44:12+00:00 2016-01-01 22:44:12
1 Natalie Cole, singer and daughter of Nat King ... <p><a href="http://www.theguardian.com/culture... 4102 728 2017-11-29T04:58:24Z The Guardian 2016-01-01T21:45:07Z 2016-01-01 21:45:07+00:00 2016-01-01 21:45:07
2 Carly Fiorina tweets support for alma mater's ... <p>The Republican presidential candidate Carly... 1980 314 2017-07-14T20:17:16Z theguardian.com 2016-01-01T21:40:36Z 2016-01-01 21:40:36+00:00 2016-01-01 21:40:36
3 Slowdown in Chinese manufacturing deepens fear... <p>A further slowdown in China’s vast manufact... 4128 687 2017-11-29T04:58:43Z The Guardian 2016-01-01T18:41:33Z 2016-01-01 18:41:33+00:00 2016-01-01 18:41:33
4 Some New Year resolutions for the Guardian <p>It’s inspiring to read in the <a href="http... 5700 952 2017-11-29T04:58:51Z The Guardian 2016-01-01T18:26:46Z 2016-01-01 18:26:46+00:00 2016-01-01 18:26:46
In [67]:
#Derive calendar features (numeric month, month name, year) from PublishDate.
publish_dates = pd.to_datetime(newsarticlesdf2['PublishDate'], errors='coerce')
newsarticlesdf2['PublishDate'] = publish_dates
newsarticlesdf2['month'] = publish_dates.dt.month
newsarticlesdf2['Month full'] = publish_dates.dt.strftime('%B')
newsarticlesdf2['year'] = publish_dates.dt.year
newsarticlesdf2.head()
C:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
C:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
C:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
C:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
Out[67]:
headline body charCount wordcount lastModified publication webPublicationDate PublicationDate PublishDate PublishTime month Month full year
0 Tamir Rice protesters picket house of Clevelan... <p>Anger over the decision <a href="http://www... 3704 627 2017-07-14T20:17:15Z theguardian.com 2016-01-01T22:44:12Z 2016-01-01 22:44:12+00:00 2016-01-01 22:44:12 1 January 2016
1 Natalie Cole, singer and daughter of Nat King ... <p><a href="http://www.theguardian.com/culture... 4102 728 2017-11-29T04:58:24Z The Guardian 2016-01-01T21:45:07Z 2016-01-01 21:45:07+00:00 2016-01-01 21:45:07 1 January 2016
2 Carly Fiorina tweets support for alma mater's ... <p>The Republican presidential candidate Carly... 1980 314 2017-07-14T20:17:16Z theguardian.com 2016-01-01T21:40:36Z 2016-01-01 21:40:36+00:00 2016-01-01 21:40:36 1 January 2016
3 Slowdown in Chinese manufacturing deepens fear... <p>A further slowdown in China’s vast manufact... 4128 687 2017-11-29T04:58:43Z The Guardian 2016-01-01T18:41:33Z 2016-01-01 18:41:33+00:00 2016-01-01 18:41:33 1 January 2016
4 Some New Year resolutions for the Guardian <p>It’s inspiring to read in the <a href="http... 5700 952 2017-11-29T04:58:51Z The Guardian 2016-01-01T18:26:46Z 2016-01-01 18:26:46+00:00 2016-01-01 18:26:46 1 January 2016
In [68]:
newsarticlesdf2.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40320 entries, 0 to 40319
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype              
---  ------              --------------  -----              
 0   headline            40320 non-null  object             
 1   body                40320 non-null  object             
 2   charCount           40320 non-null  object             
 3   wordcount           40320 non-null  object             
 4   lastModified        40320 non-null  object             
 5   publication         40320 non-null  object             
 6   webPublicationDate  40320 non-null  object             
 7   PublicationDate     40320 non-null  datetime64[ns, UTC]
 8   PublishDate         40320 non-null  datetime64[ns]     
 9   PublishTime         40320 non-null  object             
 10  month               40320 non-null  int64              
 11  Month full          40320 non-null  object             
 12  year                40320 non-null  int64              
dtypes: datetime64[ns, UTC](1), datetime64[ns](1), int64(2), object(9)
memory usage: 4.0+ MB
In [69]:
#Distribution of headline counts per calendar month (aggregated over all years).
#Title fixed: the plot shows counts per month, not per year. Passing the column
#via x=/data= keywords avoids seaborn's deprecated positional-Series usage.
plt.figure(figsize=(10,6))
sns.countplot(x='month', data=newsarticlesdf2)
plt.title('Count of number of headlines published per month')
Out[69]:
Text(0.5,1,'Count of number of headlines published per year')
In [ ]:
 
In [70]:
# Build a year x month table of article counts; columns follow the
# predefined order_month list so months appear in calendar order.
month = newsarticlesdf2['Month full']
year = newsarticlesdf2['year']

monthbyyear = (newsarticlesdf2.groupby([month, year])
               .size()
               .rename_axis(['Month', 'Year'])
               .unstack('Month')
               .reindex(columns=order_month))
monthbyyear
Out[70]:
Month January February March April May June July August September October November December
Year
2016 1151 1144 1220 1091 1175 1205 1136 962 1049 1074 1141 894
2017 965 834 951 701 884 937 806 690 742 789 819 689
2018 652 617 736 710 728 648 696 642 639 724 771 596
2019 717 685 776 686 786 710 726 679 741 894 918 794
In [71]:
# Visualise the month x year publication counts; warmer cells mark
# months with heavier publishing activity.
ax = sns.heatmap(monthbyyear, cmap='coolwarm')
ax.set_title('Heatmap - count of number of headlines published per month and year')
Out[71]:
Text(0.5,1,'Heatmap - count of number of headlines published per month and year')

Comments
There was a spike in the number of articles published in November. Note that this dataset covers 2016–2019, so the spike cannot be attributed to the coronavirus pandemic, which only began taking pace in late 2019/2020; US election and Brexit coverage are more plausible drivers.

In [72]:
#Sort the df by publish date and reset the index.
#Reassigning the result (instead of inplace=True on what may be a slice)
#avoids the SettingWithCopyWarning seen in the original output.
newsarticlesdf2 = newsarticlesdf2.sort_values(by='PublishDate').reset_index(drop=True)
newsarticlesdf2.head()
C:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
Out[72]:
headline body charCount wordcount lastModified publication webPublicationDate PublicationDate PublishDate PublishTime month Month full year
0 Tamir Rice protesters picket house of Clevelan... <p>Anger over the decision <a href="http://www... 3704 627 2017-07-14T20:17:15Z theguardian.com 2016-01-01T22:44:12Z 2016-01-01 22:44:12+00:00 2016-01-01 22:44:12 1 January 2016
1 Novel about Jewish-Palestinian love affair is ... <p>A novel about a love affair between a Jewis... 3797 614 2017-09-20T10:16:37Z theguardian.com 2016-01-01T01:04:07Z 2016-01-01 01:04:07+00:00 2016-01-01 01:04:07 1 January 2016
2 Former ABC managing director Brian Johns dies ... <p>Former ABC managing director Brian Johns ha... 2475 412 2018-02-22T14:49:58Z theguardian.com 2016-01-01T01:22:12Z 2016-01-01 01:22:12+00:00 2016-01-01 01:22:12 1 January 2016
3 Hillary Clinton was told Angela Merkel is agai... <p>Hillary Clinton was informed that German ch... 6988 1191 2017-07-14T20:17:24Z theguardian.com 2016-01-01T04:36:07Z 2016-01-01 04:36:07+00:00 2016-01-01 04:36:07 1 January 2016
4 Munich police warned of attack by 'five to sev... <div id="block-568611b9e4b0073bf25b8dfe" class... 13445 2361 2016-12-30T11:58:39Z theguardian.com 2016-01-01T05:55:46Z 2016-01-01 05:55:46+00:00 2016-01-01 05:55:46 1 January 2016
In [73]:
newsarticlesdf2.describe()
Out[73]:
month year
count 40320.000000 40320.000000
mean 6.428745 2017.325918
std 3.448247 1.152631
min 1.000000 2016.000000
25% 3.000000 2016.000000
50% 6.000000 2017.000000
75% 10.000000 2018.000000
max 12.000000 2019.000000
In [74]:
#WordCount and Character count from the Guardian API is for the body of the text.
#Create a new column for the character and word count of the headline.
#str.split() with no argument splits on runs of whitespace, so repeated spaces
#no longer inflate the word count the way split(" ") did (it produced empty tokens).
newsarticlesdf2['headline_text_count'] = newsarticlesdf2['headline'].apply(lambda x: len(str(x).split()))
newsarticlesdf2['headline_char_count'] = newsarticlesdf2['headline'].str.len()
newsarticlesdf2.head()
C:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
C:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
Out[74]:
headline body charCount wordcount lastModified publication webPublicationDate PublicationDate PublishDate PublishTime month Month full year headline_text_count headline_char_count
0 Tamir Rice protesters picket house of Clevelan... <p>Anger over the decision <a href="http://www... 3704 627 2017-07-14T20:17:15Z theguardian.com 2016-01-01T22:44:12Z 2016-01-01 22:44:12+00:00 2016-01-01 22:44:12 1 January 2016 10 74
1 Novel about Jewish-Palestinian love affair is ... <p>A novel about a love affair between a Jewis... 3797 614 2017-09-20T10:16:37Z theguardian.com 2016-01-01T01:04:07Z 2016-01-01 01:04:07+00:00 2016-01-01 01:04:07 1 January 2016 10 76
2 Former ABC managing director Brian Johns dies ... <p>Former ABC managing director Brian Johns ha... 2475 412 2018-02-22T14:49:58Z theguardian.com 2016-01-01T01:22:12Z 2016-01-01 01:22:12+00:00 2016-01-01 01:22:12 1 January 2016 9 53
3 Hillary Clinton was told Angela Merkel is agai... <p>Hillary Clinton was informed that German ch... 6988 1191 2017-07-14T20:17:24Z theguardian.com 2016-01-01T04:36:07Z 2016-01-01 04:36:07+00:00 2016-01-01 04:36:07 1 January 2016 10 68
4 Munich police warned of attack by 'five to sev... <div id="block-568611b9e4b0073bf25b8dfe" class... 13445 2361 2016-12-30T11:58:39Z theguardian.com 2016-01-01T05:55:46Z 2016-01-01 05:55:46+00:00 2016-01-01 05:55:46 1 January 2016 13 72
In [75]:
newsarticlesdf2.describe()
Out[75]:
month year headline_text_count headline_char_count
count 40320.000000 40320.000000 40320.000000 40320.000000
mean 6.428745 2017.325918 10.900446 67.019767
std 3.448247 1.152631 2.540568 13.328471
min 1.000000 2016.000000 2.000000 12.000000
25% 3.000000 2016.000000 9.000000 60.000000
50% 6.000000 2017.000000 11.000000 68.000000
75% 10.000000 2018.000000 12.000000 75.000000
max 12.000000 2019.000000 25.000000 140.000000
In [76]:
# How many headlines exist for each headline word count.
fig = plt.figure(figsize=(10, 6))
sns.countplot(newsarticlesdf2['headline_text_count'])
plt.title('Count of number headlines per word count')
Out[76]:
Text(0.5,1,'Count of number headlines per word count')
In [77]:
# How many headlines exist for each headline character count.
fig = plt.figure(figsize=(15, 8))
sns.countplot(newsarticlesdf2['headline_char_count'])
plt.title('Count of number of characters in a headline')
Out[77]:
Text(0.5,1,'Count of number of characters in a headline')

Comments
Headlines generally have fewer words, to grab the reader's attention. In this dataset the average headline is around 11 words (see the describe output above), with the shortest at 2 words and the longest at 25 [23].

In [78]:
#Drop boilerplate headline types (photo galleries, quizzes, podcasts, ...) that
#are unlikely to carry market-moving news.
headlines_to_drop = ['Student News:','Newsdesk','in pictures','Picture desk','best photographs','world in pictures','photo highlights','Your photographs of',': 23032007','Student News','Daily Newsdesk','podcast:','StudentNews1:','News quiz:','Helen Boden:','Milling Around','52 weeks:','Underwater photography:','The Illustrated London News','Photo highlights','picture of the day','photo of the day','Eyewitness:','photographer of the year','Video:',' photographs of the day','video:','pictures of the day','Xan Brooks','Activate 2011:','Media Talk:','MediaTalk:','Mediatalk:','Media talk:','In pictures:','365 days:','366 days','The Guardian Essential Report','Mediatalk','MediaTalk','quiz:','tweets:','weekly:','Weekly:','quiz of']
#re.escape protects against regex metacharacters sneaking into the patterns;
#na=False keeps any missing headline from producing NaN in the boolean mask.
pattern = '|'.join(re.escape(p) for p in headlines_to_drop)
newsarticlesdf3 = newsarticlesdf2[~newsarticlesdf2.headline.str.contains(pattern, na=False)]
In [79]:
newsarticlesdf3.shape
Out[79]:
(40265, 15)

Comments
Remove the above words listed in headlines_to_drop as these headlines may not have an impact on the stock market. In order to precisely forecast the stock price using the news articles containing photos, image processing could be used to see what photos were released.

In [80]:
#Remove time zone from the datetime column - can't export file where timezones are present - not supported by excel.
#tz_localize(None) keeps the wall-clock value and simply drops the UTC offset.
newsarticlesdf3['PublicationDate']= newsarticlesdf3['PublicationDate'].dt.tz_localize(None)
C:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
In [81]:
newsarticlesdf3.head()
Out[81]:
headline body charCount wordcount lastModified publication webPublicationDate PublicationDate PublishDate PublishTime month Month full year headline_text_count headline_char_count
0 Tamir Rice protesters picket house of Clevelan... <p>Anger over the decision <a href="http://www... 3704 627 2017-07-14T20:17:15Z theguardian.com 2016-01-01T22:44:12Z 2016-01-01 22:44:12 2016-01-01 22:44:12 1 January 2016 10 74
1 Novel about Jewish-Palestinian love affair is ... <p>A novel about a love affair between a Jewis... 3797 614 2017-09-20T10:16:37Z theguardian.com 2016-01-01T01:04:07Z 2016-01-01 01:04:07 2016-01-01 01:04:07 1 January 2016 10 76
2 Former ABC managing director Brian Johns dies ... <p>Former ABC managing director Brian Johns ha... 2475 412 2018-02-22T14:49:58Z theguardian.com 2016-01-01T01:22:12Z 2016-01-01 01:22:12 2016-01-01 01:22:12 1 January 2016 9 53
3 Hillary Clinton was told Angela Merkel is agai... <p>Hillary Clinton was informed that German ch... 6988 1191 2017-07-14T20:17:24Z theguardian.com 2016-01-01T04:36:07Z 2016-01-01 04:36:07 2016-01-01 04:36:07 1 January 2016 10 68
4 Munich police warned of attack by 'five to sev... <div id="block-568611b9e4b0073bf25b8dfe" class... 13445 2361 2016-12-30T11:58:39Z theguardian.com 2016-01-01T05:55:46Z 2016-01-01 05:55:46 2016-01-01 05:55:46 1 January 2016 13 72
In [82]:
#Take an independent copy before the text-cleaning steps: plain assignment only
#aliases the same frame, so later mutations of newsarticlesdf4 would silently
#change newsarticlesdf3 as well (and keep triggering SettingWithCopyWarning).
newsarticlesdf4 = newsarticlesdf3.copy()
newsarticlesdf4.shape
Out[82]:
(40265, 15)
In [83]:
newsarticlesdf4['wordcount'].unique()
Out[83]:
array(['627', '614', '412', ..., '2380', '1803', '11442'], dtype=object)
In [84]:
#newsarticlesdf4.to_excel('newsarticlesdf4.xlsx')
In [85]:
#newsarticlesdf4 = pd.read_excel('newsarticlesdf4.xlsx')
In [86]:
newsarticlesdf4.shape
Out[86]:
(40265, 15)
In [87]:
newsarticlesdf4.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 40265 entries, 0 to 40319
Data columns (total 15 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   headline             40265 non-null  object        
 1   body                 40265 non-null  object        
 2   charCount            40265 non-null  object        
 3   wordcount            40265 non-null  object        
 4   lastModified         40265 non-null  object        
 5   publication          40265 non-null  object        
 6   webPublicationDate   40265 non-null  object        
 7   PublicationDate      40265 non-null  datetime64[ns]
 8   PublishDate          40265 non-null  datetime64[ns]
 9   PublishTime          40265 non-null  object        
 10  month                40265 non-null  int64         
 11  Month full           40265 non-null  object        
 12  year                 40265 non-null  int64         
 13  headline_text_count  40265 non-null  int64         
 14  headline_char_count  40265 non-null  int64         
dtypes: datetime64[ns](2), int64(4), object(9)
memory usage: 4.9+ MB
In [88]:
# Force both text columns to plain strings before the NLP cleaning steps.
for text_col in ('headline', 'body'):
    newsarticlesdf4[text_col] = newsarticlesdf4[text_col].astype('str')
C:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
C:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
In [89]:
from wordcloud import WordCloud, STOPWORDS,ImageColorGenerator
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer, PorterStemmer
from textblob import TextBlob
from PIL import Image
from os import path

Clean text in body

In [91]:
#code adapted from Data Science Blog [24]
#remove <br> </br> <p> </p> <b> </b> - html tags which were present in the body.
#A single alternation regex replaces six sequential full-frame passes.
#NOTE(review): other tags (<a>, <div>) remain in the text, as in the original.
newsarticlesdf4 = newsarticlesdf4.replace(r'</?(?:br|b|p)>', '', regex=True)
newsarticlesdf4.head()
Out[91]:
headline body charCount wordcount lastModified publication webPublicationDate PublicationDate PublishDate PublishTime month Month full year headline_text_count headline_char_count
0 Tamir Rice protesters picket house of Clevelan... Anger over the decision <a href="http://www.th... 3704 627 2017-07-14T20:17:15Z theguardian.com 2016-01-01T22:44:12Z 2016-01-01 22:44:12 2016-01-01 22:44:12 1 January 2016 10 74
1 Novel about Jewish-Palestinian love affair is ... A novel about a love affair between a Jewish w... 3797 614 2017-09-20T10:16:37Z theguardian.com 2016-01-01T01:04:07Z 2016-01-01 01:04:07 2016-01-01 01:04:07 1 January 2016 10 76
2 Former ABC managing director Brian Johns dies ... Former ABC managing director Brian Johns has d... 2475 412 2018-02-22T14:49:58Z theguardian.com 2016-01-01T01:22:12Z 2016-01-01 01:22:12 2016-01-01 01:22:12 1 January 2016 9 53
3 Hillary Clinton was told Angela Merkel is agai... Hillary Clinton was informed that German chanc... 6988 1191 2017-07-14T20:17:24Z theguardian.com 2016-01-01T04:36:07Z 2016-01-01 04:36:07 2016-01-01 04:36:07 1 January 2016 10 68
4 Munich police warned of attack by 'five to sev... <div id="block-568611b9e4b0073bf25b8dfe" class... 13445 2361 2016-12-30T11:58:39Z theguardian.com 2016-01-01T05:55:46Z 2016-01-01 05:55:46 2016-01-01 05:55:46 1 January 2016 13 72
In [92]:
#Use newsarticlesdf4 which contains headlines and body text
#Lowercase the text and collapse runs of whitespace into single spaces.
#Vectorised .str operations replace the Python-level generator inside apply,
#producing the same result as " ".join(x.lower() for x in x.split()).
newsarticlesdf4['headline_clean'] = newsarticlesdf4['headline'].str.lower().str.split().str.join(' ')
newsarticlesdf4['body_clean'] = newsarticlesdf4['body'].str.lower().str.split().str.join(' ')
newsarticlesdf4.head()
Out[92]:
headline body charCount wordcount lastModified publication webPublicationDate PublicationDate PublishDate PublishTime month Month full year headline_text_count headline_char_count headline_clean body_clean
0 Tamir Rice protesters picket house of Clevelan... Anger over the decision <a href="http://www.th... 3704 627 2017-07-14T20:17:15Z theguardian.com 2016-01-01T22:44:12Z 2016-01-01 22:44:12 2016-01-01 22:44:12 1 January 2016 10 74 tamir rice protesters picket house of clevelan... anger over the decision <a href="http://www.th...
1 Novel about Jewish-Palestinian love affair is ... A novel about a love affair between a Jewish w... 3797 614 2017-09-20T10:16:37Z theguardian.com 2016-01-01T01:04:07Z 2016-01-01 01:04:07 2016-01-01 01:04:07 1 January 2016 10 76 novel about jewish-palestinian love affair is ... a novel about a love affair between a jewish w...
2 Former ABC managing director Brian Johns dies ... Former ABC managing director Brian Johns has d... 2475 412 2018-02-22T14:49:58Z theguardian.com 2016-01-01T01:22:12Z 2016-01-01 01:22:12 2016-01-01 01:22:12 1 January 2016 9 53 former abc managing director brian johns dies ... former abc managing director brian johns has d...
3 Hillary Clinton was told Angela Merkel is agai... Hillary Clinton was informed that German chanc... 6988 1191 2017-07-14T20:17:24Z theguardian.com 2016-01-01T04:36:07Z 2016-01-01 04:36:07 2016-01-01 04:36:07 1 January 2016 10 68 hillary clinton was told angela merkel is agai... hillary clinton was informed that german chanc...
4 Munich police warned of attack by 'five to sev... <div id="block-568611b9e4b0073bf25b8dfe" class... 13445 2361 2016-12-30T11:58:39Z theguardian.com 2016-01-01T05:55:46Z 2016-01-01 05:55:46 2016-01-01 05:55:46 1 January 2016 13 72 munich police warned of attack by 'five to sev... <div id="block-568611b9e4b0073bf25b8dfe" class...
In [93]:
#remove special characters from the text (anything that is not a word
#character or whitespace). The raw string avoids the invalid-escape warning
#for \w, and regex=True is explicit because pandas >= 2.0 no longer treats
#str.replace patterns as regular expressions by default.
newsarticlesdf4['headline_clean'] = newsarticlesdf4['headline_clean'].str.replace(r'[^\w\s]', '', regex=True)
newsarticlesdf4['body_clean'] = newsarticlesdf4['body_clean'].str.replace(r'[^\w\s]', '', regex=True)
newsarticlesdf4.head()
Out[93]:
headline body charCount wordcount lastModified publication webPublicationDate PublicationDate PublishDate PublishTime month Month full year headline_text_count headline_char_count headline_clean body_clean
0 Tamir Rice protesters picket house of Clevelan... Anger over the decision <a href="http://www.th... 3704 627 2017-07-14T20:17:15Z theguardian.com 2016-01-01T22:44:12Z 2016-01-01 22:44:12 2016-01-01 22:44:12 1 January 2016 10 74 tamir rice protesters picket house of clevelan... anger over the decision a hrefhttpwwwtheguardi...
1 Novel about Jewish-Palestinian love affair is ... A novel about a love affair between a Jewish w... 3797 614 2017-09-20T10:16:37Z theguardian.com 2016-01-01T01:04:07Z 2016-01-01 01:04:07 2016-01-01 01:04:07 1 January 2016 10 76 novel about jewishpalestinian love affair is b... a novel about a love affair between a jewish w...
2 Former ABC managing director Brian Johns dies ... Former ABC managing director Brian Johns has d... 2475 412 2018-02-22T14:49:58Z theguardian.com 2016-01-01T01:22:12Z 2016-01-01 01:22:12 2016-01-01 01:22:12 1 January 2016 9 53 former abc managing director brian johns dies ... former abc managing director brian johns has d...
3 Hillary Clinton was told Angela Merkel is agai... Hillary Clinton was informed that German chanc... 6988 1191 2017-07-14T20:17:24Z theguardian.com 2016-01-01T04:36:07Z 2016-01-01 04:36:07 2016-01-01 04:36:07 1 January 2016 10 68 hillary clinton was told angela merkel is agai... hillary clinton was informed that german chanc...
4 Munich police warned of attack by 'five to sev... <div id="block-568611b9e4b0073bf25b8dfe" class... 13445 2361 2016-12-30T11:58:39Z theguardian.com 2016-01-01T05:55:46Z 2016-01-01 05:55:46 2016-01-01 05:55:46 1 January 2016 13 72 munich police warned of attack by five to seve... div idblock568611b9e4b0073bf25b8dfe classblock...
In [94]:
#Stopwords - remove stop words
stop = stopwords.words('english') #includes words such as I,my,we
#Extend the stop words to remove the below. These words will appear a lot in the headlines, which means
#we would not be able to see the more important words in the headlines
stop.extend(['news','new','from', 'subject', 're', 'edu', 'use', 'not', 'would', 'say', 'could', '_', 'be', 'know', 'good', 'go', 'get', 'do', 'done', 'try', 'many', 'some', 'nice', 'thank', 'think', 'see', 'rather', 'easy', 'easily', 'lot', 'lack', 'make', 'want', 'seem', 'run', 'need', 'even', 'right', 'line', 'even', 'also', 'may', 'take', 'come','media','photo','photography','bn'])
#Membership tests against a set are O(1) per token, vs a linear scan of the
#~200-entry list for every word of every article.
stop_set = set(stop)
newsarticlesdf4['headline_clean_stop'] = newsarticlesdf4['headline_clean'].apply(lambda x: " ".join(x for x in x.split() if x not in stop_set))
newsarticlesdf4['body_clean_stop'] = newsarticlesdf4['body_clean'].apply(lambda x: " ".join(x for x in x.split() if x not in stop_set))

newsarticlesdf4.head()
Out[94]:
headline body charCount wordcount lastModified publication webPublicationDate PublicationDate PublishDate PublishTime month Month full year headline_text_count headline_char_count headline_clean body_clean headline_clean_stop body_clean_stop
0 Tamir Rice protesters picket house of Clevelan... Anger over the decision <a href="http://www.th... 3704 627 2017-07-14T20:17:15Z theguardian.com 2016-01-01T22:44:12Z 2016-01-01 22:44:12 2016-01-01 22:44:12 1 January 2016 10 74 tamir rice protesters picket house of clevelan... anger over the decision a hrefhttpwwwtheguardi... tamir rice protesters picket house cleveland p... anger decision hrefhttpwwwtheguardiancomusnews...
1 Novel about Jewish-Palestinian love affair is ... A novel about a love affair between a Jewish w... 3797 614 2017-09-20T10:16:37Z theguardian.com 2016-01-01T01:04:07Z 2016-01-01 01:04:07 2016-01-01 01:04:07 1 January 2016 10 76 novel about jewishpalestinian love affair is b... a novel about a love affair between a jewish w... novel jewishpalestinian love affair barred isr... novel love affair jewish woman palestinian man...
2 Former ABC managing director Brian Johns dies ... Former ABC managing director Brian Johns has d... 2475 412 2018-02-22T14:49:58Z theguardian.com 2016-01-01T01:22:12Z 2016-01-01 01:22:12 2016-01-01 01:22:12 1 January 2016 9 53 former abc managing director brian johns dies ... former abc managing director brian johns has d... former abc managing director brian johns dies ... former abc managing director brian johns died ...
3 Hillary Clinton was told Angela Merkel is agai... Hillary Clinton was informed that German chanc... 6988 1191 2017-07-14T20:17:24Z theguardian.com 2016-01-01T04:36:07Z 2016-01-01 04:36:07 2016-01-01 04:36:07 1 January 2016 10 68 hillary clinton was told angela merkel is agai... hillary clinton was informed that german chanc... hillary clinton told angela merkel obama pheno... hillary clinton informed german chancellor ang...
4 Munich police warned of attack by 'five to sev... <div id="block-568611b9e4b0073bf25b8dfe" class... 13445 2361 2016-12-30T11:58:39Z theguardian.com 2016-01-01T05:55:46Z 2016-01-01 05:55:46 2016-01-01 05:55:46 1 January 2016 13 72 munich police warned of attack by five to seve... div idblock568611b9e4b0073bf25b8dfe classblock... munich police warned attack five seven suicide... div idblock568611b9e4b0073bf25b8dfe classblock...
In [95]:
#Create word cloud
#Concatenate every headline into one string for the word cloud generator.
text = " ".join(headlines for headlines in newsarticlesdf4.headline)
#len(text) counts characters, so the message now says so (the original said "words").
print ("There are {} characters in the combination of all headlines.".format(len(text)))
There are 2739517 words in the combination of all headlines.
In [96]:
#Stop words for the word cloud: domain noise words merged with wordcloud's built-in STOPWORDS.
#NOTE(review): this rebinds the name `stopwords`, shadowing the nltk.corpus
#`stopwords` module imported earlier - any later call to stopwords.words() would fail.
stopwords = ['BBC','say','new','News','media','Today','will','ad','says','news','from', 'subject', 're', 'edu', 'use', 'not', 'would', 'say', 'could', '_', 'be', 'know', 'good', 'go', 'get', 'do', 'done', 'try', 'many', 'some', 'nice', 'thank', 'think', 'see', 'rather', 'easy', 'easily', 'lot', 'lack', 'make', 'want', 'seem', 'run', 'need', 'even', 'right', 'line', 'even', 'also', 'may', 'take', 'come','media','photo','photography','bn'] + list(STOPWORDS)
wordcloud = WordCloud(stopwords=stopwords, background_color="white").generate(text)

#Render the cloud without axes so only the word layout is visible.
plt.figure(figsize = (15,10))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.title('Word Cloud of all headlines')
plt.show()

The time series plots of the FTSE 100 show four distinct periods, hence investigate the words present for each of these periods:

  • 2016
  • 2017
  • 2018
  • 2019
In [97]:
# Slice the articles by publication year; `year` is an integer column,
# so equality is equivalent to the original >= / < range pairs.
news2016 = newsarticlesdf4.loc[newsarticlesdf4['year'] == 2016]
news2017 = newsarticlesdf4.loc[newsarticlesdf4['year'] == 2017]
news2018 = newsarticlesdf4.loc[newsarticlesdf4['year'] == 2018]
news2019 = newsarticlesdf4.loc[newsarticlesdf4['year'] == 2019]
In [98]:
#Create word cloud
#Concatenate the 2016 headlines; len() counts characters, and the
#message now says so (the original said "words").
text2016 = " ".join(headlines for headlines in news2016.headline)
print ("There are {} characters in the combination of all headlines in 2016".format(len(text2016)))
There are 886045 words in the combination of all headlines in 2016
In [99]:
# Render the 2016 word cloud; collocations are disabled so bigrams
# are not fused into single tokens.
wordcloud2016 = WordCloud(stopwords=stopwords, background_color="white",
                          collocations=False).generate(text2016)

fig, ax = plt.subplots(figsize=(10, 10))
ax.imshow(wordcloud2016, interpolation='bilinear')
ax.axis("off")
ax.set_title('Word Cloud for 2016')
plt.show()
In [100]:
#Create word cloud
#Concatenate the 2017 headlines; len() counts characters, and the
#message now says so (the original said "words").
text2017 = " ".join(headlines for headlines in news2017.headline)
print ("There are {} characters in the combination of all headlines in 2017.".format(len(text2017)))
There are 668889 words in the combination of all headlines in 2017.
In [101]:
# Render the 2017 word cloud; collocations are disabled so bigrams
# are not fused into single tokens.
wordcloud2017 = WordCloud(stopwords=stopwords, background_color="white",
                          collocations=False).generate(text2017)

fig, ax = plt.subplots(figsize=(10, 10))
ax.imshow(wordcloud2017, interpolation='bilinear')
ax.axis("off")
ax.set_title('Word Cloud for 2017')
plt.show()
In [102]:
#Create word cloud
#Concatenate the 2018 headlines; len() counts characters, and the
#message now says so (the original said "words").
text2018 = " ".join(headlines for headlines in news2018.headline)
print ("There are {} characters in the combination of all headlines in 2018.".format(len(text2018)))
There are 553906 words in the combination of all headlines in 2018.
In [103]:
# Render the 2018 word cloud; collocations are disabled so bigrams
# are not fused into single tokens.
wordcloud2018 = WordCloud(stopwords=stopwords, background_color="white",
                          collocations=False).generate(text2018)

fig, ax = plt.subplots(figsize=(10, 10))
ax.imshow(wordcloud2018, interpolation='bilinear')
ax.axis("off")
ax.set_title('Word Cloud for 2018')
plt.show()
In [104]:
#Create word cloud
#Concatenate the 2019 headlines; len() counts characters, and the
#message now says so (the original said "words").
text2019 = " ".join(headlines for headlines in news2019.headline)
print ("There are {} characters in the combination of all headlines in 2019.".format(len(text2019)))
There are 630674 words in the combination of all headlines in 2019.
In [105]:
# Render the 2019 word cloud; collocations are disabled so bigrams
# are not fused into single tokens.
wordcloud2019 = WordCloud(stopwords=stopwords, background_color="white",
                          collocations=False).generate(text2019)

fig, ax = plt.subplots(figsize=(10, 10))
ax.imshow(wordcloud2019, interpolation='bilinear')
ax.axis("off")
ax.set_title('Word Cloud for 2019')
plt.show()

Comments

There are four distinct periods when looking at the stock indices graphs

  • 2016 - Clinton
  • 2017 - Trump
  • 2018 - Trump
  • 2019 - Brexit

Text Blob

In [106]:
def detect_polarity(text):
    """Return TextBlob's sentiment polarity for `text` (per TextBlob docs: -1.0 negative to +1.0 positive)."""
    return TextBlob(text).sentiment.polarity
#Score both the cleaned headline and the cleaned body of each article.
newsarticlesdf4['polarity_headline'] = newsarticlesdf4.headline_clean_stop.apply(detect_polarity)
newsarticlesdf4['polarity_body'] = newsarticlesdf4.body_clean_stop.apply(detect_polarity)
newsarticlesdf4.head()
Out[106]:
headline body charCount wordcount lastModified publication webPublicationDate PublicationDate PublishDate PublishTime ... Month full year headline_text_count headline_char_count headline_clean body_clean headline_clean_stop body_clean_stop polarity_headline polarity_body
0 Tamir Rice protesters picket house of Clevelan... Anger over the decision <a href="http://www.th... 3704 627 2017-07-14T20:17:15Z theguardian.com 2016-01-01T22:44:12Z 2016-01-01 22:44:12 2016-01-01 22:44:12 ... January 2016 10 74 tamir rice protesters picket house of clevelan... anger over the decision a hrefhttpwwwtheguardi... tamir rice protesters picket house cleveland p... anger decision hrefhttpwwwtheguardiancomusnews... 0.000000 0.176356
1 Novel about Jewish-Palestinian love affair is ... A novel about a love affair between a Jewish w... 3797 614 2017-09-20T10:16:37Z theguardian.com 2016-01-01T01:04:07Z 2016-01-01 01:04:07 2016-01-01 01:04:07 ... January 2016 10 76 novel about jewishpalestinian love affair is b... a novel about a love affair between a jewish w... novel jewishpalestinian love affair barred isr... novel love affair jewish woman palestinian man... 0.500000 0.115833
2 Former ABC managing director Brian Johns dies ... Former ABC managing director Brian Johns has d... 2475 412 2018-02-22T14:49:58Z theguardian.com 2016-01-01T01:22:12Z 2016-01-01 01:22:12 2016-01-01 01:22:12 ... January 2016 9 53 former abc managing director brian johns dies ... former abc managing director brian johns has d... former abc managing director brian johns dies ... former abc managing director brian johns died ... -0.050000 0.119396
3 Hillary Clinton was told Angela Merkel is agai... Hillary Clinton was informed that German chanc... 6988 1191 2017-07-14T20:17:24Z theguardian.com 2016-01-01T04:36:07Z 2016-01-01 04:36:07 2016-01-01 04:36:07 ... January 2016 10 68 hillary clinton was told angela merkel is agai... hillary clinton was informed that german chanc... hillary clinton told angela merkel obama pheno... hillary clinton informed german chancellor ang... 0.000000 0.029613
4 Munich police warned of attack by 'five to sev... <div id="block-568611b9e4b0073bf25b8dfe" class... 13445 2361 2016-12-30T11:58:39Z theguardian.com 2016-01-01T05:55:46Z 2016-01-01 05:55:46 2016-01-01 05:55:46 ... January 2016 13 72 munich police warned of attack by five to seve... div idblock568611b9e4b0073bf25b8dfe classblock... munich police warned attack five seven suicide... div idblock568611b9e4b0073bf25b8dfe classblock... 0.136364 0.148593

5 rows × 21 columns

In [107]:
def detect_subjectivity(text):
    """Return TextBlob's subjectivity score for `text` (0 = objective, 1 = subjective)."""
    blob = TextBlob(text)
    return blob.sentiment.subjectivity

# Score both the cleaned headline and the cleaned body text.
newsarticlesdf4['subjectivity_headline'] = newsarticlesdf4['headline_clean_stop'].apply(detect_subjectivity)
newsarticlesdf4['subjectivity_body'] = newsarticlesdf4['body_clean_stop'].apply(detect_subjectivity)
newsarticlesdf4.head()
Out[107]:
headline body charCount wordcount lastModified publication webPublicationDate PublicationDate PublishDate PublishTime ... headline_text_count headline_char_count headline_clean body_clean headline_clean_stop body_clean_stop polarity_headline polarity_body subjectivity_headline subjectivity_body
0 Tamir Rice protesters picket house of Clevelan... Anger over the decision <a href="http://www.th... 3704 627 2017-07-14T20:17:15Z theguardian.com 2016-01-01T22:44:12Z 2016-01-01 22:44:12 2016-01-01 22:44:12 ... 10 74 tamir rice protesters picket house of clevelan... anger over the decision a hrefhttpwwwtheguardi... tamir rice protesters picket house cleveland p... anger decision hrefhttpwwwtheguardiancomusnews... 0.000000 0.176356 0.0 0.598280
1 Novel about Jewish-Palestinian love affair is ... A novel about a love affair between a Jewish w... 3797 614 2017-09-20T10:16:37Z theguardian.com 2016-01-01T01:04:07Z 2016-01-01 01:04:07 2016-01-01 01:04:07 ... 10 76 novel about jewishpalestinian love affair is b... a novel about a love affair between a jewish w... novel jewishpalestinian love affair barred isr... novel love affair jewish woman palestinian man... 0.500000 0.115833 0.6 0.365417
2 Former ABC managing director Brian Johns dies ... Former ABC managing director Brian Johns has d... 2475 412 2018-02-22T14:49:58Z theguardian.com 2016-01-01T01:22:12Z 2016-01-01 01:22:12 2016-01-01 01:22:12 ... 9 53 former abc managing director brian johns dies ... former abc managing director brian johns has d... former abc managing director brian johns dies ... former abc managing director brian johns died ... -0.050000 0.119396 0.2 0.368978
3 Hillary Clinton was told Angela Merkel is agai... Hillary Clinton was informed that German chanc... 6988 1191 2017-07-14T20:17:24Z theguardian.com 2016-01-01T04:36:07Z 2016-01-01 04:36:07 2016-01-01 04:36:07 ... 10 68 hillary clinton was told angela merkel is agai... hillary clinton was informed that german chanc... hillary clinton told angela merkel obama pheno... hillary clinton informed german chancellor ang... 0.000000 0.029613 0.0 0.396207
4 Munich police warned of attack by 'five to sev... <div id="block-568611b9e4b0073bf25b8dfe" class... 13445 2361 2016-12-30T11:58:39Z theguardian.com 2016-01-01T05:55:46Z 2016-01-01 05:55:46 2016-01-01 05:55:46 ... 13 72 munich police warned of attack by five to seve... div idblock568611b9e4b0073bf25b8dfe classblock... munich police warned attack five seven suicide... div idblock568611b9e4b0073bf25b8dfe classblock... 0.136364 0.148593 0.5 0.462881

5 rows × 23 columns

In [108]:
#Histogram of the TextBlob polarity of each headline
num_bins = 20
plt.figure(figsize=(10, 6))
n, bins, patches = plt.hist(newsarticlesdf4['polarity_headline'],
                            bins=num_bins,
                            facecolor='blue',
                            alpha=0.5)
plt.xlabel('Polarity')
plt.ylabel('Count')
plt.title('Histogram of polarity')
plt.show();
In [109]:
#Histogram of the TextBlob subjectivity of each headline
num_bins = 20
plt.figure(figsize=(10, 6))
n, bins, patches = plt.hist(newsarticlesdf4['subjectivity_headline'],
                            bins=num_bins,
                            facecolor='blue',
                            alpha=0.5)
plt.xlabel('Subjectivity')
plt.ylabel('Count')
plt.title('Histogram of Subjectivity of Headline')
plt.show();
In [110]:
#Histogram of the TextBlob polarity of each article body
num_bins = 20
plt.figure(figsize=(10, 6))
n, bins, patches = plt.hist(newsarticlesdf4['polarity_body'],
                            bins=num_bins,
                            facecolor='blue',
                            alpha=0.5)
plt.xlabel('Polarity')
plt.ylabel('Count')
plt.title('Histogram of polarity')
plt.show();
In [111]:
#Distribution of subjectivity of body 
num_bins = 20
plt.figure(figsize=(10,6))
n, bins, patches = plt.hist(newsarticlesdf4.subjectivity_body, num_bins, facecolor='blue', alpha=0.5)
plt.xlabel('Subjectivity')
plt.ylabel('Count')
# Bug fix: the title previously read 'Histogram of polarity' although this
# cell plots the subjectivity of the article body.
plt.title('Histogram of Subjectivity of Body')
plt.show();

Comments
TextBlob does not show any meaningful results for the sentiment of the headline. A number of articles have been classified as neutral when they should not be. VADER will be used instead to calculate the sentiment of the headline. TextBlob shows some results for the sentiment of the body of the article; however, due to computational resources, the body of the article is not in the scope of this investigation.

VADER

In [112]:
#Import vader package
# NOTE(review): this re-imports SentimentIntensityAnalyzer (already imported at
# the top of the notebook, where an `analyser` instance was also created).
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
# Rule-based sentiment scorer used for the headlines below.
analyzer = SentimentIntensityAnalyzer()
In [113]:
#apply vader sentiment to output columns neg,pos,neu and compound. Code adapted from [27]
# Each polarity_scores() call returns a dict {'neg', 'neu', 'pos', 'compound'}.
sentiment = newsarticlesdf4['headline'].apply(lambda x: analyzer.polarity_scores(x))
# Expand the score dicts into columns and attach them to the frame.
# Fix: pass `axis` as a keyword — the positional form pd.concat([...], 1)
# is deprecated and removed in modern pandas.
newsarticlesdf4 = pd.concat([newsarticlesdf4, sentiment.apply(pd.Series)], axis=1)
newsarticlesdf4.head()
Out[113]:
headline body charCount wordcount lastModified publication webPublicationDate PublicationDate PublishDate PublishTime ... headline_clean_stop body_clean_stop polarity_headline polarity_body subjectivity_headline subjectivity_body neg neu pos compound
0 Tamir Rice protesters picket house of Clevelan... Anger over the decision <a href="http://www.th... 3704 627 2017-07-14T20:17:15Z theguardian.com 2016-01-01T22:44:12Z 2016-01-01 22:44:12 2016-01-01 22:44:12 ... tamir rice protesters picket house cleveland p... anger decision hrefhttpwwwtheguardiancomusnews... 0.000000 0.176356 0.0 0.598280 0.174 0.826 0.000 -0.2263
1 Novel about Jewish-Palestinian love affair is ... A novel about a love affair between a Jewish w... 3797 614 2017-09-20T10:16:37Z theguardian.com 2016-01-01T01:04:07Z 2016-01-01 01:04:07 2016-01-01 01:04:07 ... novel jewishpalestinian love affair barred isr... novel love affair jewish woman palestinian man... 0.500000 0.115833 0.6 0.365417 0.000 0.552 0.448 0.7579
2 Former ABC managing director Brian Johns dies ... Former ABC managing director Brian Johns has d... 2475 412 2018-02-22T14:49:58Z theguardian.com 2016-01-01T01:22:12Z 2016-01-01 01:22:12 2016-01-01 01:22:12 ... former abc managing director brian johns dies ... former abc managing director brian johns died ... -0.050000 0.119396 0.2 0.368978 0.000 1.000 0.000 0.0000
3 Hillary Clinton was told Angela Merkel is agai... Hillary Clinton was informed that German chanc... 6988 1191 2017-07-14T20:17:24Z theguardian.com 2016-01-01T04:36:07Z 2016-01-01 04:36:07 2016-01-01 04:36:07 ... hillary clinton told angela merkel obama pheno... hillary clinton informed german chancellor ang... 0.000000 0.029613 0.0 0.396207 0.000 1.000 0.000 0.0000
4 Munich police warned of attack by 'five to sev... <div id="block-568611b9e4b0073bf25b8dfe" class... 13445 2361 2016-12-30T11:58:39Z theguardian.com 2016-01-01T05:55:46Z 2016-01-01 05:55:46 2016-01-01 05:55:46 ... munich police warned attack five seven suicide... div idblock568611b9e4b0073bf25b8dfe classblock... 0.136364 0.148593 0.5 0.462881 0.492 0.508 0.000 -0.8658

5 rows × 27 columns

In [114]:
# Keep only the compound VADER score; the neg/neu/pos components are not used
# in the rest of the analysis.
newsarticlesdf4 = newsarticlesdf4.drop(columns=['neg', 'neu', 'pos'])
newsarticlesdf4.head()
Out[114]:
headline body charCount wordcount lastModified publication webPublicationDate PublicationDate PublishDate PublishTime ... headline_char_count headline_clean body_clean headline_clean_stop body_clean_stop polarity_headline polarity_body subjectivity_headline subjectivity_body compound
0 Tamir Rice protesters picket house of Clevelan... Anger over the decision <a href="http://www.th... 3704 627 2017-07-14T20:17:15Z theguardian.com 2016-01-01T22:44:12Z 2016-01-01 22:44:12 2016-01-01 22:44:12 ... 74 tamir rice protesters picket house of clevelan... anger over the decision a hrefhttpwwwtheguardi... tamir rice protesters picket house cleveland p... anger decision hrefhttpwwwtheguardiancomusnews... 0.000000 0.176356 0.0 0.598280 -0.2263
1 Novel about Jewish-Palestinian love affair is ... A novel about a love affair between a Jewish w... 3797 614 2017-09-20T10:16:37Z theguardian.com 2016-01-01T01:04:07Z 2016-01-01 01:04:07 2016-01-01 01:04:07 ... 76 novel about jewishpalestinian love affair is b... a novel about a love affair between a jewish w... novel jewishpalestinian love affair barred isr... novel love affair jewish woman palestinian man... 0.500000 0.115833 0.6 0.365417 0.7579
2 Former ABC managing director Brian Johns dies ... Former ABC managing director Brian Johns has d... 2475 412 2018-02-22T14:49:58Z theguardian.com 2016-01-01T01:22:12Z 2016-01-01 01:22:12 2016-01-01 01:22:12 ... 53 former abc managing director brian johns dies ... former abc managing director brian johns has d... former abc managing director brian johns dies ... former abc managing director brian johns died ... -0.050000 0.119396 0.2 0.368978 0.0000
3 Hillary Clinton was told Angela Merkel is agai... Hillary Clinton was informed that German chanc... 6988 1191 2017-07-14T20:17:24Z theguardian.com 2016-01-01T04:36:07Z 2016-01-01 04:36:07 2016-01-01 04:36:07 ... 68 hillary clinton was told angela merkel is agai... hillary clinton was informed that german chanc... hillary clinton told angela merkel obama pheno... hillary clinton informed german chancellor ang... 0.000000 0.029613 0.0 0.396207 0.0000
4 Munich police warned of attack by 'five to sev... <div id="block-568611b9e4b0073bf25b8dfe" class... 13445 2361 2016-12-30T11:58:39Z theguardian.com 2016-01-01T05:55:46Z 2016-01-01 05:55:46 2016-01-01 05:55:46 ... 72 munich police warned of attack by five to seve... div idblock568611b9e4b0073bf25b8dfe classblock... munich police warned attack five seven suicide... div idblock568611b9e4b0073bf25b8dfe classblock... 0.136364 0.148593 0.5 0.462881 -0.8658

5 rows × 24 columns

In [115]:
#Add column to distinguish if the sentiment is positive,negative or neutral

def sentiment_class(num):
    """Classify a VADER compound score as 'Positive', 'Negative' or 'Neutral'."""
    if num > 0:
        return 'Positive'
    if num < 0:
        return 'Negative'
    return 'Neutral'

# Label each headline with its sentiment class for later grouping/plotting.
newsarticlesdf4['VaderSentiment'] = newsarticlesdf4['compound'].apply(sentiment_class)

newsarticlesdf4.head(1)
Out[115]:
headline body charCount wordcount lastModified publication webPublicationDate PublicationDate PublishDate PublishTime ... headline_clean body_clean headline_clean_stop body_clean_stop polarity_headline polarity_body subjectivity_headline subjectivity_body compound VaderSentiment
0 Tamir Rice protesters picket house of Clevelan... Anger over the decision <a href="http://www.th... 3704 627 2017-07-14T20:17:15Z theguardian.com 2016-01-01T22:44:12Z 2016-01-01 22:44:12 2016-01-01 22:44:12 ... tamir rice protesters picket house of clevelan... anger over the decision a hrefhttpwwwtheguardi... tamir rice protesters picket house cleveland p... anger decision hrefhttpwwwtheguardiancomusnews... 0.0 0.176356 0.0 0.59828 -0.2263 Negative

1 rows × 25 columns

Q. Are the news headlines predominantly positive, negative or neutral?
In [117]:
plt.figure(figsize=(10,6))
# Fix: pass the vectors by keyword — newer seaborn releases no longer accept
# positional data vectors for countplot.
sns.countplot(x=newsarticlesdf4['year'], hue=newsarticlesdf4['VaderSentiment'])
plt.title('Number of headlines per year split by VaderSentiment')
Out[117]:
Text(0.5,1,'Number of headlines per year split by VaderSentiment')
In [128]:
#Create a subset of headlines from the year 2016 with positive VADER sentiment
# (the original comment said "2016 onwards and negative sentiment", which did
# not match the filter below)
news2016 = newsarticlesdf4.loc[(newsarticlesdf4['year']== 2016) &(newsarticlesdf4['VaderSentiment'] == 'Positive')]
news2016.shape
Out[128]:
(3462, 25)
In [129]:
#Create word cloud from the 2016 positive-sentiment headlines
text2016 = " ".join(headlines for headlines in news2016.headline_clean_stop)

# Fix: `stopwords` here is the NLTK corpus loader, not a collection of words;
# build the explicit English stop-word set WordCloud expects.
stop_words = set(stopwords.words('english'))
wordcloud2016 = WordCloud(stopwords=stop_words, background_color="white", collocations = False).generate(text2016)

plt.figure(figsize = (10,10))
plt.imshow(wordcloud2016, interpolation='bilinear')
plt.axis("off")
# Fix: the previous title ('negative sentiment October to December') did not
# describe the data, which is the positive-sentiment 2016 subset built above.
plt.title('Word Cloud for positive sentiment in 2016')
plt.show()

Comments
Digging deeper into the increase in negative sentiment headlines in 2016, the words associated with these headlines are 'fire', 'attack', 'labour' and 'death'. There has been an increase in the number of pandemics recently, as well as Brexit causing uncertainty.

Topic Modelling and LDA

Q. Can news headlines be classified into topics reliably?
In [132]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2018)
import nltk
nltk.download('wordnet')
import spacy

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)
import warnings
warnings.simplefilter("ignore", DeprecationWarning)
[nltk_data] Downloading package wordnet to C:\Users\Shivam
[nltk_data]     Dixit\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
In [133]:
#stem words to the root of each word e.g. publishes becomes publish. Code adapted from StackOverflow [29]
# Performance fix: create the stemmer once instead of instantiating a new
# PorterStemmer for every token across ~40k headlines.
stemmer = PorterStemmer()

def stem_word(text):
    """Return `text` with every whitespace-separated token Porter-stemmed."""
    tokens = text.split()
    stemmed_tokens = [stemmer.stem(token) for token in tokens]
    return ' '.join(stemmed_tokens)

newsarticlesdf4['headline_clean_stop_stem'] = newsarticlesdf4['headline_clean_stop'].apply(stem_word)
newsarticlesdf4.head()
Out[133]:
headline body charCount wordcount lastModified publication webPublicationDate PublicationDate PublishDate PublishTime ... body_clean headline_clean_stop body_clean_stop polarity_headline polarity_body subjectivity_headline subjectivity_body compound VaderSentiment headline_clean_stop_stem
0 Tamir Rice protesters picket house of Clevelan... Anger over the decision <a href="http://www.th... 3704 627 2017-07-14T20:17:15Z theguardian.com 2016-01-01T22:44:12Z 2016-01-01 22:44:12 2016-01-01 22:44:12 ... anger over the decision a hrefhttpwwwtheguardi... tamir rice protesters picket house cleveland p... anger decision hrefhttpwwwtheguardiancomusnews... 0.000000 0.176356 0.0 0.598280 -0.2263 Negative tamir rice protest picket hous cleveland prose...
1 Novel about Jewish-Palestinian love affair is ... A novel about a love affair between a Jewish w... 3797 614 2017-09-20T10:16:37Z theguardian.com 2016-01-01T01:04:07Z 2016-01-01 01:04:07 2016-01-01 01:04:07 ... a novel about a love affair between a jewish w... novel jewishpalestinian love affair barred isr... novel love affair jewish woman palestinian man... 0.500000 0.115833 0.6 0.365417 0.7579 Positive novel jewishpalestinian love affair bar isra c...
2 Former ABC managing director Brian Johns dies ... Former ABC managing director Brian Johns has d... 2475 412 2018-02-22T14:49:58Z theguardian.com 2016-01-01T01:22:12Z 2016-01-01 01:22:12 2016-01-01 01:22:12 ... former abc managing director brian johns has d... former abc managing director brian johns dies ... former abc managing director brian johns died ... -0.050000 0.119396 0.2 0.368978 0.0000 Neutral former abc manag director brian john die age 79
3 Hillary Clinton was told Angela Merkel is agai... Hillary Clinton was informed that German chanc... 6988 1191 2017-07-14T20:17:24Z theguardian.com 2016-01-01T04:36:07Z 2016-01-01 04:36:07 2016-01-01 04:36:07 ... hillary clinton was informed that german chanc... hillary clinton told angela merkel obama pheno... hillary clinton informed german chancellor ang... 0.000000 0.029613 0.0 0.396207 0.0000 Neutral hillari clinton told angela merkel obama pheno...
4 Munich police warned of attack by 'five to sev... <div id="block-568611b9e4b0073bf25b8dfe" class... 13445 2361 2016-12-30T11:58:39Z theguardian.com 2016-01-01T05:55:46Z 2016-01-01 05:55:46 2016-01-01 05:55:46 ... div idblock568611b9e4b0073bf25b8dfe classblock... munich police warned attack five seven suicide... div idblock568611b9e4b0073bf25b8dfe classblock... 0.136364 0.148593 0.5 0.462881 -0.8658 Negative munich polic warn attack five seven suicid bom...

5 rows × 26 columns

In [134]:
#Column headline_clean_stop_stem already excludes stopwords and punctuation
#Materialise the cleaned, stemmed headlines as a plain Python list for gensim

headline_text = list(newsarticlesdf4['headline_clean_stop_stem'])
print(headline_text[:1])
['tamir rice protest picket hous cleveland prosecutor timothi mcginti']
In [135]:
#Code for LDA modelling has been adapted from DataSkunkWorks [30] and Machine Learning Plus [32]
#Tokenize words which is required by LDA 
#Code for LDA modelling has been adapted from DataSkunkWorks [30] and Machine Learning Plus [32]
#Tokenize each headline, as required by LDA
def doc_words(sentences):
    """Yield each sentence as a list of tokens (gensim simple_preprocess; deacc strips accents)."""
    for raw in sentences:
        tokens = gensim.utils.simple_preprocess(str(raw), deacc=True)
        yield tokens

headlinewords = list(doc_words(headline_text))

print(headlinewords[:1])
[['tamir', 'rice', 'protest', 'picket', 'hous', 'cleveland', 'prosecutor', 'timothi', 'mcginti']]
In [136]:
import gensim.corpora as corpora
# Map each token to an integer id.
id2word = corpora.Dictionary(headlinewords)
 
# Create Corpus: bag-of-words representation of each headline
corpus = [id2word.doc2bow(text) for text in headlinewords]
 
# Build LDA model. random_state pins the otherwise stochastic training so the
# topics are reproducible across kernel restarts.
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=4, 
                                           random_state=2018,
                                           per_word_topics=True)
In [137]:
# Model diagnostics: perplexity (lower is better) ...
print('\nPerplexity: ', lda_model.log_perplexity(corpus))

# ... and c_v topic coherence (higher is better).
coherence_model_lda = CoherenceModel(model=lda_model, texts=headlinewords,
                                     dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)
Perplexity:  -8.587750507777455

Coherence Score:  0.32672386948844057
In [138]:
from pprint import pprint
# Top-10 keywords (with weights) for each of the 4 topics.
pprint(lda_model.print_topics())
# Topic distributions for every document (lazy wrapper).
doc_lda = lda_model[corpus]
[(0,
  '0.032*"brexit" + 0.023*"happen" + 0.015*"johnson" + 0.014*"say" + '
  '0.013*"trump" + 0.013*"labour" + 0.011*"brief" + 0.011*"impeach" + '
  '0.011*"uk" + 0.010*"bori"'),
 (1,
  '0.010*"polic" + 0.008*"trump" + 0.006*"attack" + 0.006*"fire" + '
  '0.006*"court" + 0.006*"die" + 0.005*"man" + 0.005*"death" + 0.005*"offic" + '
  '0.005*"murder"'),
 (2,
  '0.011*"us" + 0.008*"say" + 0.006*"stori" + 0.005*"australia" + '
  '0.005*"guardian" + 0.005*"blaze" + 0.005*"crisi" + 0.005*"climat" + '
  '0.005*"trump" + 0.004*"paper"'),
 (3,
  '0.023*"trump" + 0.018*"happen" + 0.011*"elect" + 0.011*"us" + '
  '0.009*"bushfir" + 0.008*"year" + 0.007*"record" + 0.006*"mail" + 0.006*"uk" '
  '+ 0.006*"climat"')]
In [139]:
# Interactive inter-topic distance map and term bar charts for the 4-topic model.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis
Out[139]:

Comments

From the visualisation above we see that the topics overlap even when the number of topics is set to 4. In addition, the coherence score of the model is quite low, indicating that this model may not be optimal.
Note
As this is an unsupervised algorithm, the topic classifications change each time it is run.

In [140]:
# Find the optimal number of topics for LDA using gensim
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
    """
    Compute c_v coherence for various number of topics

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    limit : Max num of topics

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        model = gensim.models.ldamodel.LdaModel(corpus=corpus, num_topics=num_topics, id2word=id2word,per_word_topics=True)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values
In [141]:
#Find optimal number of topics - TAKES LONG TO RUN
# Sweeps num_topics = 2, 8, 14, ..., 56 and records the coherence of each model.
model_list, coherence_values = compute_coherence_values(dictionary=id2word, corpus=corpus, texts=headlinewords, start=2, limit=60, step=6)
In [142]:
#Show graph of coherence score vs number of topics
limit=60; start=2; step=6;
x = range(start, limit, step)
plt.plot(x, coherence_values)
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
# Bug fix: ("coherence_values") is just a parenthesised string (no trailing
# comma), which matplotlib iterates character by character; pass a list.
plt.legend(["coherence_values"], loc='best')
plt.title('Finding the optimal number of topics')
plt.show()
In [143]:
# Print the coherence score obtained for each candidate topic count.
for n_topics, score in zip(x, coherence_values):
    print("Num Topics =", n_topics, " has Coherence Value of", round(score, 4))
Num Topics = 2  has Coherence Value of 0.2904
Num Topics = 8  has Coherence Value of 0.2984
Num Topics = 14  has Coherence Value of 0.4093
Num Topics = 20  has Coherence Value of 0.3999
Num Topics = 26  has Coherence Value of 0.3928
Num Topics = 32  has Coherence Value of 0.4271
Num Topics = 38  has Coherence Value of 0.4645
Num Topics = 44  has Coherence Value of 0.4802
Num Topics = 50  has Coherence Value of 0.5037
Num Topics = 56  has Coherence Value of 0.4982

Comments
Coherence allows us to judge how good the topic model is. From the graph above, the coherence score is at its highest at 50 topics. We will change the LDA topic number to 50 below and visualize them.

In [144]:
# Rebuild the dictionary/corpus and fit the final model with the
# coherence-optimal number of topics (50).
id2word = corpora.Dictionary(headlinewords)
 
#Create Corpus
corpus = [id2word.doc2bow(text) for text in headlinewords]
 
#Build LDA model. random_state pins the stochastic training for reproducibility.
lda_model_optimise = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                          id2word=id2word,
                                          num_topics=50, 
                                          random_state=2018,
                                          per_word_topics=True)
In [145]:
from pprint import pprint
# Top-10 keywords (with weights) for each of the 50 topics.
pprint(lda_model_optimise.print_topics())
# Topic distributions for every document under the 50-topic model.
doc_lda_optimise = lda_model_optimise[corpus]
[(44,
  '0.105*"peopl" + 0.090*"talk" + 0.066*"use" + 0.056*"countri" + '
  '0.055*"across" + 0.054*"quit" + 0.044*"point" + 0.029*"feel" + '
  '0.026*"german" + 0.026*"radio"'),
 (23,
  '0.072*"public" + 0.058*"work" + 0.050*"left" + 0.049*"best" + '
  '0.045*"social" + 0.042*"scientist" + 0.041*"worker" + 0.039*"demand" + '
  '0.034*"word" + 0.034*"what"'),
 (22,
  '0.148*"climat" + 0.123*"chang" + 0.058*"question" + 0.054*"major" + '
  '0.050*"senat" + 0.049*"andrew" + 0.036*"look" + 0.026*"control" + '
  '0.025*"gun" + 0.022*"financi"'),
 (27,
  '0.072*"risk" + 0.070*"london" + 0.057*"way" + 0.046*"dont" + 0.045*"near" + '
  '0.039*"investig" + 0.036*"lead" + 0.034*"conserv" + 0.033*"shoot" + '
  '0.031*"find"'),
 (3,
  '0.180*"leader" + 0.108*"threat" + 0.046*"link" + 0.042*"rightw" + '
  '0.036*"address" + 0.032*"femal" + 0.031*"heart" + 0.027*"littl" + '
  '0.027*"lord" + 0.026*"indigen"'),
 (41,
  '0.045*"cost" + 0.043*"drug" + 0.041*"mean" + 0.040*"breach" + 0.039*"water" '
  '+ 0.037*"hunt" + 0.035*"judg" + 0.030*"evacu" + 0.030*"mark" + '
  '0.028*"celebr"'),
 (10,
  '0.124*"fire" + 0.068*"war" + 0.066*"claim" + 0.050*"face" + 0.048*"battl" + '
  '0.048*"record" + 0.047*"two" + 0.040*"victim" + 0.040*"charg" + '
  '0.030*"alleg"'),
 (9,
  '0.096*"bbc" + 0.072*"nsw" + 0.071*"pay" + 0.069*"support" + 0.065*"minist" '
  '+ 0.058*"woman" + 0.045*"japan" + 0.039*"arrest" + 0.033*"resign" + '
  '0.032*"death"'),
 (6,
  '0.104*"law" + 0.060*"michael" + 0.053*"issu" + 0.047*"car" + 0.044*"fox" + '
  '0.038*"avoid" + 0.036*"struggl" + 0.027*"access" + 0.026*"toxic" + '
  '0.022*"phone"'),
 (11,
  '0.127*"protest" + 0.061*"keep" + 0.056*"condemn" + 0.053*"tri" + '
  '0.046*"block" + 0.038*"black" + 0.035*"activist" + 0.031*"go" + '
  '0.028*"wage" + 0.026*"immigr"'),
 (2,
  '0.150*"call" + 0.119*"bushfir" + 0.088*"mail" + 0.065*"voter" + '
  '0.049*"threaten" + 0.031*"elect" + 0.028*"concern" + 0.026*"staff" + '
  '0.024*"un" + 0.023*"role"'),
 (45,
  '0.371*"trump" + 0.055*"donald" + 0.043*"republican" + 0.039*"presid" + '
  '0.038*"happen" + 0.026*"bid" + 0.024*"thursday" + 0.021*"island" + '
  '0.017*"china" + 0.014*"irish"'),
 (25,
  '0.098*"tell" + 0.059*"russia" + 0.051*"former" + 0.049*"ask" + 0.038*"move" '
  '+ 0.037*"ukrain" + 0.035*"seat" + 0.032*"firefight" + 0.032*"wont" + '
  '0.030*"act"'),
 (1,
  '0.112*"polic" + 0.090*"kill" + 0.066*"offic" + 0.064*"murder" + '
  '0.036*"five" + 0.030*"home" + 0.029*"without" + 0.029*"young" + '
  '0.027*"thing" + 0.026*"union"'),
 (40,
  '0.174*"australian" + 0.055*"school" + 0.039*"raid" + 0.036*"build" + '
  '0.035*"recess" + 0.029*"men" + 0.029*"six" + 0.027*"babi" + 0.026*"road" + '
  '0.026*"book"'),
 (4,
  '0.064*"kong" + 0.064*"hong" + 0.063*"fake" + 0.060*"tv" + 0.040*"lost" + '
  '0.038*"defend" + 0.037*"secret" + 0.035*"crash" + 0.035*"saudi" + '
  '0.028*"devast"'),
 (12,
  '0.101*"home" + 0.095*"ban" + 0.070*"nh" + 0.048*"emerg" + 0.045*"push" + '
  '0.040*"secretari" + 0.038*"cultur" + 0.026*"vow" + 0.025*"servic" + '
  '0.023*"seven"'),
 (7,
  '0.168*"australia" + 0.085*"reveal" + 0.050*"border" + 0.045*"zealand" + '
  '0.039*"tuesday" + 0.037*"aid" + 0.033*"terror" + 0.033*"confirm" + '
  '0.029*"cancer" + 0.023*"hour"'),
 (49,
  '0.072*"next" + 0.066*"stop" + 0.062*"lose" + 0.048*"christma" + '
  '0.042*"street" + 0.039*"year" + 0.035*"wall" + 0.032*"nearli" + '
  '0.030*"loom" + 0.027*"star"'),
 (16,
  '0.079*"speech" + 0.061*"america" + 0.059*"scandal" + 0.050*"letter" + '
  '0.049*"york" + 0.048*"defeat" + 0.040*"free" + 0.038*"sale" + '
  '0.033*"econom" + 0.032*"corpor"')]
In [146]:
# Visualise the 50-topic model; the heavy topic overlap is discussed below.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model_optimise, corpus, id2word)
vis
Out[146]:

Comments
50 topics gives the highest coherence score, which suggests the model is optimal. However, when applying 50 topics nearly all of the topics overlap. Looking at the four topics generated earlier, we can't easily give them a label; hence we can't generate topics reliably. Classifying topics for different years may yield better results.

In [147]:
# NOTE(review): this rebinds `model_list`, shadowing the list of fitted LDA
# models returned by compute_coherence_values above — rename one of the two
# if the model list is still needed.
model_list = lda_model.print_topics()
model_list
Out[147]:
[(0,
  '0.032*"brexit" + 0.023*"happen" + 0.015*"johnson" + 0.014*"say" + 0.013*"trump" + 0.013*"labour" + 0.011*"brief" + 0.011*"impeach" + 0.011*"uk" + 0.010*"bori"'),
 (1,
  '0.010*"polic" + 0.008*"trump" + 0.006*"attack" + 0.006*"fire" + 0.006*"court" + 0.006*"die" + 0.005*"man" + 0.005*"death" + 0.005*"offic" + 0.005*"murder"'),
 (2,
  '0.011*"us" + 0.008*"say" + 0.006*"stori" + 0.005*"australia" + 0.005*"guardian" + 0.005*"blaze" + 0.005*"crisi" + 0.005*"climat" + 0.005*"trump" + 0.004*"paper"'),
 (3,
  '0.023*"trump" + 0.018*"happen" + 0.011*"elect" + 0.011*"us" + 0.009*"bushfir" + 0.008*"year" + 0.007*"record" + 0.006*"mail" + 0.006*"uk" + 0.006*"climat"')]
In [148]:
# Top-10 (word, probability) pairs for topic 1 of the 4-topic model.
lda_model.show_topic(1)
Out[148]:
[('polic', 0.009887165),
 ('trump', 0.008081143),
 ('attack', 0.0064942758),
 ('fire', 0.0059241825),
 ('court', 0.0059180576),
 ('die', 0.0058108736),
 ('man', 0.0054070735),
 ('death', 0.0053335214),
 ('offic', 0.0045604785),
 ('murder', 0.004531983)]
In [149]:
# Applying the model to the corpus returns a lazy TransformedCorpus wrapper.
lda_model[corpus]
Out[149]:
<gensim.interfaces.TransformedCorpus at 0x1b09edaacf8>
In [150]:
# Per-document view for one headline: (topic distribution, word-topic
# assignments, word-topic probabilities) — three lists because
# per_word_topics=True was set at training time.
lda_model[corpus[1]]
Out[150]:
([(0, 0.03625277), (1, 0.8900083), (2, 0.03781183), (3, 0.03592713)],
 [(9, [1]), (10, [1]), (11, [1]), (12, [1, 2]), (13, [1]), (14, [1])],
 [(9, [(1, 0.9998547)]),
  (10, [(1, 0.99982196)]),
  (11, [(1, 0.9857629)]),
  (12, [(1, 0.985373), (2, 0.011955815)]),
  (13, [(1, 0.99633026)]),
  (14, [(1, 0.99759984)])])
In [151]:
# Topic distribution (topic id, probability) for the first headline only.
test_topics = lda_model.get_document_topics(corpus[0])
test_topics
Out[151]:
[(0, 0.029377522), (1, 0.55166143), (2, 0.38869834), (3, 0.030262662)]
In [152]:
# Dominant topic = the (topic, probability) pair with the highest probability.
test_topics_max = max(test_topics, key=lambda pair: pair[1])
test_topics_max
Out[152]:
(1, 0.55166143)
In [153]:
# Wrap the dominant (topic, probability) pair in a DataFrame for display.
test_topics2 = pd.DataFrame(list(test_topics_max))
In [154]:
# Transpose so topic id and probability become the columns of a single row.
test_topics2 = test_topics2.transpose()
In [155]:
# Name the columns for readability.
test_topics2.columns=['Topic','Percentage']
test_topics2
Out[155]:
Topic Percentage
0 1.0 0.551661
In [156]:
#Get the topics and their percentage contribution for the headline
# One list of (topic, probability) pairs per headline — O(corpus size), so
# this can take a while over ~40k documents.
get_document_topics = [lda_model.get_document_topics(item) for item in corpus]
In [157]:
# One row per headline; each cell holds a (topic, probability) tuple.
document_topics_df = pd.DataFrame(list(get_document_topics))
document_topics_df.head()
Out[157]:
0 1 2 3
0 (0, 0.029378155) (1, 0.5516524) (2, 0.3886798) (3, 0.030289726)
1 (0, 0.03625279) (1, 0.88998854) (2, 0.037831526) (3, 0.03592715)
2 (0, 0.02829543) (1, 0.91458917) (2, 0.02866402) (3, 0.028451368)
3 (0, 0.033161197) (1, 0.034838077) (2, 0.32615036) (3, 0.60585034)
4 (0, 0.02829622) (1, 0.22751729) (2, 0.71855) (3, 0.025636509)
In [158]:
# Sanity check: one row per headline, one column per topic.
document_topics_df.shape
Out[158]:
(40265, 4)
In [159]:
# The dominant topic per headline: keep, from each document's distribution,
# the (topic, probability) pair with the largest probability.
test_max = [max(dist, key=lambda pair: pair[1]) for dist in get_document_topics]
test_max
Out[159]:
[(1, 0.5516524),
 (1, 0.88998854),
 (1, 0.91458917),
 (3, 0.60585034),
 (2, 0.71855),
 (3, 0.67101735),
 (0, 0.64200485),
 (2, 0.79911655),
 (2, 0.5072719),
 (3, 0.4994674),
 (1, 0.809836),
 (3, 0.5076817),
 (3, 0.9151102),
 (1, 0.8238311),
 (1, 0.8346884),
 (3, 0.5092951),
 (2, 0.9045886),
 (0, 0.66712403),
 (0, 0.90973127),
 (0, 0.69372475),
 (1, 0.5915839),
 (0, 0.792938),
 (2, 0.8481872),
 (1, 0.5718195),
 (1, 0.49284282),
 (2, 0.6590071),
 (0, 0.8810503),
 (2, 0.5407791),
 (0, 0.52730554),
 (3, 0.79571974),
 (3, 0.74294627),
 (3, 0.9146001),
 (1, 0.5462774),
 (3, 0.6878305),
 (3, 0.9007797),
 (0, 0.3612136),
 (1, 0.7958127),
 (1, 0.8832151),
 (2, 0.71147627),
 (1, 0.5716043),
 (2, 0.7854444),
 (3, 0.39514935),
 (3, 0.7999159),
 (0, 0.89609003),
 (3, 0.6249277),
 (0, 0.5880376),
 (2, 0.931278),
 (0, 0.5096767),
 (0, 0.5886794),
 (2, 0.5080548),
 (3, 0.46169546),
 (1, 0.4779363),
 (1, 0.4049411),
 (0, 0.91502875),
 (3, 0.7499557),
 (2, 0.5252499),
 (1, 0.59551597),
 (3, 0.81153727),
 (3, 0.43844664),
 (1, 0.90884084),
 (3, 0.9043924),
 (0, 0.51939553),
 (1, 0.5161007),
 (1, 0.57850754),
 (3, 0.63567984),
 (3, 0.6958597),
 (2, 0.89401376),
 (0, 0.62047714),
 (2, 0.6004482),
 (0, 0.7210804),
 (3, 0.40345865),
 (2, 0.81464195),
 (2, 0.8318661),
 (2, 0.68658227),
 (2, 0.751066),
 (3, 0.7980033),
 (2, 0.36928567),
 (0, 0.47632712),
 (1, 0.5028559),
 (0, 0.928465),
 (0, 0.37814882),
 (2, 0.519184),
 (3, 0.5962143),
 (2, 0.6875185),
 (2, 0.7590832),
 (0, 0.66688716),
 (2, 0.63375515),
 (3, 0.926587),
 (0, 0.9369089),
 (2, 0.87775034),
 (2, 0.55193895),
 (3, 0.6684143),
 (3, 0.49108094),
 (3, 0.89214724),
 (2, 0.9224095),
 (0, 0.4747371),
 (2, 0.5180193),
 (3, 0.71451837),
 (0, 0.9284656),
 (3, 0.87310076),
 (1, 0.5570586),
 (1, 0.77063835),
 (2, 0.9140913),
 (3, 0.90406674),
 (1, 0.91650695),
 (0, 0.9036633),
 (0, 0.87697554),
 (0, 0.889385),
 (0, 0.9113434),
 (0, 0.56526667),
 (3, 0.772943),
 (3, 0.62960994),
 (1, 0.3721806),
 (2, 0.81246424),
 (2, 0.7607902),
 (1, 0.8266419),
 (0, 0.8992752),
 (3, 0.69037646),
 (2, 0.563504),
 (2, 0.5940869),
 (2, 0.6045768),
 (0, 0.53966624),
 (1, 0.9164467),
 (2, 0.8919133),
 (2, 0.8894664),
 (0, 0.6107588),
 (1, 0.8417948),
 (1, 0.88992965),
 (2, 0.84549624),
 (0, 0.891991),
 (1, 0.56768894),
 (0, 0.7780449),
 (0, 0.522108),
 (1, 0.7146362),
 (2, 0.6024137),
 (2, 0.73960817),
 (3, 0.31519502),
 (0, 0.9224983),
 (2, 0.83120096),
 (2, 0.71792763),
 (3, 0.7051012),
 (1, 0.62429976),
 (3, 0.58282846),
 (1, 0.5077559),
 (0, 0.79219913),
 (0, 0.6576381),
 (1, 0.5182789),
 (3, 0.6218805),
 (3, 0.76769936),
 (0, 0.8796329),
 (2, 0.84432244),
 (1, 0.8421602),
 (0, 0.80895185),
 (2, 0.8534446),
 (2, 0.79546946),
 (3, 0.9315248),
 (2, 0.71432185),
 (3, 0.76117134),
 (0, 0.5618647),
 (1, 0.5434612),
 (0, 0.6705734),
 (3, 0.50071454),
 (0, 0.6474186),
 (3, 0.4820278),
 (0, 0.892344),
 (3, 0.76170355),
 (3, 0.914879),
 (3, 0.92840964),
 (0, 0.9093212),
 (1, 0.9215832),
 (1, 0.46441725),
 (1, 0.8770703),
 (2, 0.84427893),
 (0, 0.91336584),
 (3, 0.6786271),
 (3, 0.6028466),
 (0, 0.70967567),
 (2, 0.6791478),
 (0, 0.9001736),
 (2, 0.5797711),
 (0, 0.93466854),
 (3, 0.6785101),
 (3, 0.58571863),
 (0, 0.44552654),
 (3, 0.88374853),
 (1, 0.5756483),
 (0, 0.56668407),
 (3, 0.7775581),
 (0, 0.8821212),
 (0, 0.90399784),
 (3, 0.5699237),
 (3, 0.91261303),
 (0, 0.61091876),
 (3, 0.44114068),
 (1, 0.8506534),
 (1, 0.4488894),
 (3, 0.9161713),
 (0, 0.91456133),
 (0, 0.5122137),
 (2, 0.5563005),
 (0, 0.61017853),
 (3, 0.8889776),
 (1, 0.50925785),
 (1, 0.43967023),
 (0, 0.8115576),
 (2, 0.4966051),
 (3, 0.55638444),
 (2, 0.7231564),
 (1, 0.89236605),
 (0, 0.90109104),
 (1, 0.91205627),
 (3, 0.5361702),
 (3, 0.55162996),
 (3, 0.49601147),
 (3, 0.7539739),
 (1, 0.6173172),
 (3, 0.5888473),
 (1, 0.7213618),
 (3, 0.90187275),
 (0, 0.8745815),
 (0, 0.90959406),
 (0, 0.92217076),
 (1, 0.848986),
 (1, 0.91542643),
 (1, 0.687303),
 (2, 0.90439093),
 (0, 0.9297647),
 (1, 0.42458293),
 (0, 0.5187419),
 (3, 0.6703397),
 (0, 0.908078),
 (1, 0.44207937),
 (0, 0.8856681),
 (3, 0.6641581),
 (0, 0.59829277),
 (0, 0.67613643),
 (0, 0.5624547),
 (1, 0.46748793),
 (1, 0.90718114),
 (1, 0.5994292),
 (1, 0.49182865),
 (2, 0.75529754),
 (2, 0.41617644),
 (2, 0.81246424),
 (3, 0.5292897),
 (3, 0.7591024),
 (0, 0.89260834),
 (3, 0.9234456),
 (0, 0.8720152),
 (0, 0.80295116),
 (3, 0.48438278),
 (0, 0.90235573),
 (3, 0.7025348),
 (0, 0.5652307),
 (1, 0.65307134),
 (1, 0.7908031),
 (3, 0.4065942),
 (2, 0.8290656),
 (2, 0.71428084),
 (2, 0.8861758),
 (2, 0.8742321),
 (0, 0.67533535),
 (2, 0.70324486),
 (3, 0.92270666),
 (0, 0.6280423),
 (1, 0.8480744),
 (2, 0.8305796),
 (0, 0.50352),
 (1, 0.8468116),
 (2, 0.7862589),
 (2, 0.4394489),
 (0, 0.6281904),
 (0, 0.89050174),
 (3, 0.7658746),
 (3, 0.5529068),
 (0, 0.46812943),
 (2, 0.5439348),
 (2, 0.7833327),
 (0, 0.91363055),
 (1, 0.60019624),
 (1, 0.9137314),
 (2, 0.6436976),
 (0, 0.57647824),
 (1, 0.8706555),
 (2, 0.7766181),
 (1, 0.91584516),
 (1, 0.79956514),
 (1, 0.77021056),
 (0, 0.74302596),
 (2, 0.82599723),
 (2, 0.6845586),
 (2, 0.75329876),
 (1, 0.48636848),
 (3, 0.7534124),
 (2, 0.6097642),
 (0, 0.5203151),
 (2, 0.9107506),
 (0, 0.4213481),
 (1, 0.8130113),
 (2, 0.59234804),
 (3, 0.6015468),
 (0, 0.3860175),
 (1, 0.4764174),
 (1, 0.5760167),
 (2, 0.63852346),
 (2, 0.5384059),
 (0, 0.59657604),
 (1, 0.48215246),
 (3, 0.7404305),
 (0, 0.6466293),
 (1, 0.55626535),
 (0, 0.47392175),
 (2, 0.9260719),
 (3, 0.6249277),
 (0, 0.5569834),
 (3, 0.784702),
 (3, 0.46063557),
 (3, 0.42760345),
 (2, 0.39425737),
 (1, 0.85314775),
 (3, 0.5839806),
 (1, 0.44279963),
 (0, 0.8746711),
 (1, 0.36747244),
 (2, 0.66697854),
 (0, 0.7578508),
 (1, 0.8905918),
 (0, 0.81615216),
 (1, 0.792625),
 (0, 0.8975089),
 (2, 0.6131672),
 (1, 0.8111569),
 (1, 0.9000847),
 (3, 0.79650694),
 (2, 0.648196),
 (3, 0.5894353),
 (0, 0.66568935),
 (0, 0.54268074),
 (2, 0.91002697),
 (2, 0.81246424),
 (3, 0.8036683),
 (1, 0.675324),
 (0, 0.65262794),
 (3, 0.927689),
 (0, 0.901669),
 (2, 0.5717403),
 (0, 0.59680974),
 (1, 0.8367974),
 (0, 0.62340903),
 (2, 0.8585182),
 (3, 0.73477036),
 (1, 0.5642063),
 (2, 0.87969315),
 (0, 0.46848446),
 (2, 0.80550814),
 (2, 0.6080454),
 (1, 0.56655806),
 (2, 0.4900816),
 (3, 0.73185253),
 (1, 0.58286226),
 (2, 0.86184996),
 (1, 0.7574887),
 (0, 0.9283841),
 (1, 0.85855234),
 (3, 0.9301331),
 (1, 0.90567726),
 (1, 0.5328199),
 (3, 0.5098254),
 (1, 0.76045305),
 (1, 0.48725253),
 (1, 0.86920047),
 (2, 0.78975165),
 (1, 0.75158256),
 (2, 0.36180243),
 (3, 0.8161231),
 (0, 0.70624703),
 (0, 0.89458585),
 (2, 0.86985785),
 (2, 0.6414412),
 (3, 0.55282295),
 (1, 0.8804439),
 (3, 0.49650168),
 (1, 0.45113206),
 (2, 0.81246424),
 (3, 0.48135084),
 (1, 0.3593561),
 (0, 0.9049267),
 (2, 0.84869367),
 (3, 0.61921173),
 (3, 0.59248996),
 (1, 0.76194024),
 (1, 0.5487281),
 (0, 0.9090371),
 (2, 0.4355463),
 (3, 0.4592737),
 (3, 0.69260675),
 (3, 0.6046245),
 (0, 0.5862975),
 (0, 0.92163295),
 (3, 0.8766037),
 (3, 0.4771671),
 (3, 0.9152),
 (1, 0.8998376),
 (2, 0.5354391),
 (0, 0.63340753),
 (1, 0.75945026),
 (0, 0.49988133),
 (0, 0.9170299),
 (3, 0.6462004),
 (3, 0.40089732),
 (0, 0.5616137),
 (2, 0.9220981),
 (0, 0.81242526),
 (3, 0.7469604),
 (1, 0.6180393),
 (3, 0.89837915),
 (2, 0.8848913),
 (0, 0.7296403),
 (0, 0.6805659),
 (2, 0.8581911),
 (1, 0.8880925),
 (3, 0.86199695),
 (0, 0.680461),
 (2, 0.9363448),
 (2, 0.43623596),
 (1, 0.9104707),
 (3, 0.72388905),
 (1, 0.92708755),
 (1, 0.79103065),
 (1, 0.54876596),
 (2, 0.81246424),
 (3, 0.6462248),
 (0, 0.9123934),
 (2, 0.7599351),
 (3, 0.59038347),
 (1, 0.620208),
 (2, 0.92302245),
 (0, 0.48481426),
 (2, 0.4454932),
 (1, 0.91143703),
 (0, 0.71058804),
 (1, 0.77121454),
 (3, 0.58950114),
 (1, 0.6592195),
 (0, 0.71458185),
 (3, 0.49523816),
 (2, 0.8863894),
 (3, 0.8937918),
 (3, 0.56215894),
 (2, 0.6523239),
 (2, 0.80539477),
 (2, 0.56861806),
 (3, 0.51209134),
 (0, 0.82489496),
 (0, 0.38044357),
 (3, 0.7066891),
 (1, 0.9129524),
 (0, 0.90525955),
 (3, 0.8021212),
 (2, 0.7297282),
 (2, 0.89740103),
 (3, 0.4967148),
 (3, 0.89840794),
 (2, 0.7282104),
 (1, 0.77186424),
 (0, 0.8783282),
 (1, 0.70615125),
 (0, 0.63106906),
 (0, 0.9016858),
 (2, 0.81926584),
 (3, 0.5986036),
 (0, 0.8576823),
 (3, 0.41855165),
 (1, 0.63579184),
 (0, 0.51098686),
 (1, 0.66854334),
 (3, 0.708612),
 (1, 0.56466734),
 (0, 0.7729153),
 (0, 0.91786623),
 (1, 0.9105997),
 (1, 0.5458856),
 (2, 0.53376245),
 (2, 0.81246424),
 (1, 0.79354817),
 (0, 0.4950013),
 (1, 0.44789496),
 (1, 0.77383715),
 (1, 0.6798853),
 (1, 0.81389874),
 (2, 0.51830935),
 (2, 0.69206625),
 (3, 0.8909959),
 (0, 0.9067182),
 (3, 0.7260149),
 (3, 0.6385301),
 (0, 0.5459285),
 (2, 0.8603006),
 (3, 0.606817),
 (3, 0.4635703),
 (1, 0.8923119),
 (0, 0.5551342),
 (2, 0.9012965),
 (3, 0.47993758),
 (0, 0.9273452),
 (0, 0.8098207),
 (1, 0.4463125),
 (2, 0.8443405),
 (2, 0.5894908),
 (1, 0.42660215),
 (1, 0.43932325),
 (0, 0.8926081),
 (2, 0.81246424),
 (2, 0.6077147),
 (3, 0.60445786),
 (1, 0.74440426),
 (0, 0.610314),
 (2, 0.6629563),
 (0, 0.4335796),
 (0, 0.43072966),
 (3, 0.9071065),
 (3, 0.4235961),
 (1, 0.91012895),
 (1, 0.53857905),
 (0, 0.8875888),
 (2, 0.7592696),
 (1, 0.49750662),
 (3, 0.64319056),
 (3, 0.5324626),
 (0, 0.6996878),
 (0, 0.8184433),
 (3, 0.81091887),
 (2, 0.9125772),
 (2, 0.5075573),
 (1, 0.7771332),
 (1, 0.54534745),
 (0, 0.7293838),
 (1, 0.6461546),
 (0, 0.46526694),
 (0, 0.422411),
 (0, 0.44742545),
 (1, 0.44301483),
 (3, 0.51316434),
 (2, 0.59730464),
 (0, 0.59499246),
 (0, 0.79528415),
 (1, 0.43313202),
 (3, 0.92403066),
 (0, 0.5836882),
 (0, 0.40232515),
 (3, 0.70246255),
 (0, 0.44981015),
 (0, 0.5567534),
 (2, 0.5933135),
 (3, 0.5091171),
 (0, 0.4819178),
 (1, 0.5788625),
 (3, 0.56671304),
 (2, 0.90428436),
 (1, 0.906529),
 (3, 0.78690106),
 (0, 0.58460295),
 (2, 0.6235264),
 (0, 0.71046925),
 (1, 0.683789),
 (0, 0.57412255),
 (1, 0.714699),
 (2, 0.8285323),
 (3, 0.52180004),
 (3, 0.6150022),
 (3, 0.60226613),
 (0, 0.6366984),
 (1, 0.85008085),
 (2, 0.5419823),
 (1, 0.89323765),
 (2, 0.5763522),
 (3, 0.712087),
 (1, 0.78446287),
 (3, 0.512895),
 (3, 0.818744),
 (0, 0.7415499),
 (2, 0.49271315),
 (3, 0.8978494),
 (3, 0.49195603),
 (0, 0.51747316),
 (2, 0.48418844),
 (0, 0.65313107),
 (0, 0.6323882),
 (0, 0.4855401),
 (3, 0.6249277),
 (0, 0.8969572),
 (2, 0.50469446),
 (2, 0.574692),
 (0, 0.4634509),
 (0, 0.78243613),
 (2, 0.88086736),
 (3, 0.49325863),
 (0, 0.7784035),
 (3, 0.51029116),
 (3, 0.745578),
 (0, 0.8910103),
 (2, 0.401059),
 (2, 0.81622696),
 (2, 0.5192207),
 (0, 0.7249459),
 (3, 0.6784963),
 (3, 0.6362745),
 (1, 0.9099927),
 (0, 0.8730997),
 (2, 0.7672998),
 (3, 0.5440781),
 (1, 0.45857272),
 (2, 0.49349156),
 (1, 0.6006136),
 (3, 0.53566366),
 (2, 0.81246424),
 (3, 0.48222974),
 (2, 0.9020503),
 (3, 0.5689076),
 (1, 0.6405165),
 (2, 0.46519923),
 (3, 0.87119675),
 (3, 0.89629567),
 (1, 0.54319745),
 (3, 0.59754455),
 (3, 0.74458706),
 (2, 0.70274407),
 (3, 0.48099133),
 (3, 0.9157489),
 (2, 0.9351875),
 (3, 0.4282914),
 (3, 0.914372),
 (3, 0.8346126),
 (1, 0.5575687),
 (3, 0.6654434),
 (3, 0.7499557),
 (0, 0.92286706),
 (0, 0.54407763),
 (0, 0.47874936),
 (1, 0.61793387),
 (0, 0.5700649),
 (3, 0.91843194),
 (2, 0.887218),
 (0, 0.9270326),
 (2, 0.7250447),
 (3, 0.9205566),
 (0, 0.7857545),
 (2, 0.87763476),
 (2, 0.7278991),
 (1, 0.56197554),
 (3, 0.89935756),
 (3, 0.55472434),
 (2, 0.61936015),
 (3, 0.7118576),
 (1, 0.88431555),
 (0, 0.6273293),
 (0, 0.45236757),
 (0, 0.33438915),
 (1, 0.615524),
 (2, 0.8247667),
 (3, 0.87029546),
 (1, 0.80924916),
 (0, 0.4380476),
 (2, 0.67450094),
 (3, 0.85154843),
 (2, 0.881508),
 (3, 0.46812874),
 (0, 0.52373886),
 (3, 0.62049866),
 (0, 0.89385676),
 (2, 0.501717),
 (0, 0.6874757),
 (1, 0.5305394),
 (0, 0.46585104),
 (2, 0.8902657),
 (1, 0.62546754),
 (3, 0.5223498),
 (2, 0.8982802),
 (1, 0.55581826),
 (2, 0.92221385),
 (1, 0.5623521),
 (2, 0.9096944),
 (2, 0.50951415),
 (0, 0.82179636),
 (1, 0.5751295),
 (3, 0.6939847),
 (0, 0.7757019),
 (3, 0.56087923),
 (0, 0.5157635),
 (0, 0.90739244),
 (1, 0.71233076),
 (2, 0.47673428),
 (3, 0.5730753),
 (1, 0.7768882),
 (3, 0.6953998),
 (2, 0.8819268),
 (0, 0.82515097),
 (2, 0.90401524),
 (3, 0.5335184),
 (1, 0.7652411),
 (2, 0.47280166),
 (2, 0.5445333),
 (1, 0.48507842),
 (3, 0.86154974),
 (2, 0.9144998),
 (3, 0.7558753),
 (0, 0.43348446),
 (1, 0.89086944),
 (3, 0.48158777),
 (2, 0.48802137),
 (2, 0.81246424),
 (1, 0.6405776),
 (3, 0.49302155),
 (2, 0.91366625),
 (2, 0.9219512),
 (3, 0.5380572),
 (0, 0.9090163),
 (3, 0.48833692),
 (1, 0.56991416),
 (0, 0.4312162),
 (3, 0.49621165),
 (0, 0.72620493),
 (3, 0.91265804),
 (0, 0.43661845),
 (3, 0.44025233),
 (1, 0.4742624),
 (2, 0.627134),
 (0, 0.7277269),
 (3, 0.53896767),
 (2, 0.79469985),
 (2, 0.9293187),
 (0, 0.70001495),
 (3, 0.8091993),
 (2, 0.52799493),
 (1, 0.5472206),
 (0, 0.88962454),
 (1, 0.4180909),
 (2, 0.7217199),
 (1, 0.7510072),
 (1, 0.71198595),
 (2, 0.6089336),
 (1, 0.6041778),
 (2, 0.8974997),
 (1, 0.50527966),
 (0, 0.89230937),
 (1, 0.5400127),
 (1, 0.6724064),
 (3, 0.88318694),
 (2, 0.8996155),
 (3, 0.8998039),
 (0, 0.5954972),
 (0, 0.4791535),
 (2, 0.5141959),
 (2, 0.5306326),
 (1, 0.5747227),
 (0, 0.73819655),
 (3, 0.9079146),
 (3, 0.7509242),
 (3, 0.8667775),
 (1, 0.7230621),
 (1, 0.88120776),
 (3, 0.80556625),
 (3, 0.465242),
 (0, 0.40124452),
 (0, 0.57915497),
 (2, 0.9222425),
 (2, 0.5460292),
 (0, 0.63652855),
 (3, 0.49488184),
 (1, 0.59966046),
 (3, 0.8712802),
 (2, 0.81246424),
 (0, 0.49170625),
 (3, 0.74850804),
 (2, 0.6635062),
 (3, 0.7301223),
 (0, 0.7040432),
 (0, 0.49196866),
 (1, 0.9154668),
 (1, 0.9023424),
 (2, 0.9014542),
 (0, 0.5235684),
 (0, 0.8211356),
 (3, 0.50892806),
 (3, 0.7499557),
 (1, 0.35090193),
 (2, 0.49336928),
 (1, 0.4297599),
 (3, 0.5422696),
 (1, 0.40502438),
 (1, 0.6123132),
 (0, 0.5380991),
 (0, 0.8948061),
 (3, 0.586728),
 (0, 0.9158777),
 (3, 0.89619005),
 (3, 0.91182554),
 (2, 0.8494504),
 (1, 0.373317),
 (0, 0.8890607),
 (0, 0.8926083),
 (1, 0.51002735),
 (3, 0.91871357),
 (1, 0.8008809),
 (0, 0.80531114),
 (2, 0.918521),
 (3, 0.9148392),
 (3, 0.93034154),
 (3, 0.5557078),
 (2, 0.62393236),
 (3, 0.50033534),
 (3, 0.4666169),
 (0, 0.6705074),
 (0, 0.9195954),
 (2, 0.8686464),
 (3, 0.7276412),
 (2, 0.688762),
 (1, 0.52656424),
 (0, 0.90382916),
 (3, 0.8646149),
 (3, 0.7836367),
 (0, 0.9031778),
 (1, 0.64568424),
 (1, 0.8912806),
 (3, 0.8834406),
 (3, 0.63996637),
 (3, 0.7900555),
 (2, 0.42324418),
 (2, 0.7107404),
 (3, 0.8872592),
 (2, 0.7085054),
 (1, 0.6126577),
 (2, 0.5082536),
 (3, 0.50365174),
 (2, 0.7389698),
 (3, 0.6219521),
 (0, 0.72066593),
 (0, 0.92872566),
 (0, 0.64916414),
 (3, 0.42796937),
 (0, 0.54055214),
 (3, 0.87387484),
 (3, 0.76155245),
 (1, 0.64689064),
 (0, 0.39760962),
 (3, 0.53770274),
 (1, 0.49462417),
 (3, 0.57165),
 (3, 0.6249277),
 (1, 0.90121824),
 (2, 0.48579594),
 (1, 0.59406304),
 (0, 0.86665314),
 (0, 0.56560487),
 (1, 0.58144903),
 (1, 0.7525008),
 (0, 0.8977813),
 (2, 0.91202366),
 (2, 0.9146705),
 (3, 0.5707251),
 (3, 0.75320774),
 (3, 0.6698248),
 (0, 0.37186992),
 (0, 0.88681763),
 (2, 0.80433685),
 (1, 0.6058497),
 (3, 0.69814247),
 (1, 0.36954597),
 (2, 0.5007212),
 (0, 0.869731),
 (3, 0.8565547),
 (1, 0.9199366),
 (1, 0.40478107),
 (0, 0.8407468),
 (3, 0.7131991),
 (2, 0.795091),
 (1, 0.5261346),
 (2, 0.80562925),
 (1, 0.4219866),
 (3, 0.81687385),
 (1, 0.7420972),
 (1, 0.6033693),
 (2, 0.81246424),
 (2, 0.5124688),
 (2, 0.92676103),
 (3, 0.53608394),
 (3, 0.54976565),
 (2, 0.5323369),
 (0, 0.92446256),
 (1, 0.73727214),
 (2, 0.8808426),
 (1, 0.6167363),
 (1, 0.8839309),
 (1, 0.9178796),
 (3, 0.91411245),
 (3, 0.55078),
 (3, 0.44680795),
 (1, 0.65793216),
 (2, 0.55987406),
 (0, 0.40985045),
 (0, 0.5785282),
 (0, 0.7271169),
 (1, 0.46365258),
 (3, 0.74673545),
 (1, 0.5071746),
 (3, 0.41669524),
 (3, 0.7818464),
 (1, 0.3750107),
 (3, 0.70438683),
 (0, 0.7304451),
 (1, 0.5646453),
 (3, 0.73604405),
 (1, 0.47361562),
 (0, 0.7764383),
 (0, 0.9042198),
 (0, 0.49923614),
 (3, 0.9097397),
 (2, 0.34987274),
 (3, 0.5707989),
 (2, 0.84731835),
 (3, 0.8846869),
 (1, 0.5696032),
 (2, 0.88721293),
 (0, 0.91471255),
 (3, 0.81057566),
 (3, 0.5781084),
 (3, 0.54527164),
 (0, 0.8401519),
 (1, 0.38541523),
 (1, 0.7851532),
 (3, 0.47386998),
 (1, 0.5578995),
 (1, 0.7598896),
 (0, 0.91234565),
 (0, 0.58683467),
 (2, 0.9150705),
 (2, 0.5953342),
 (1, 0.50759536),
 (0, 0.71087277),
 (3, 0.56578195),
 (3, 0.71901107),
 (2, 0.91622835),
 (2, 0.9020384),
 (2, 0.37335306),
 (0, 0.7858935),
 (0, 0.9287185),
 (3, 0.5701319),
 (1, 0.50840044),
 (0, 0.81242526),
 (3, 0.68124026),
 (2, 0.41806445),
 (1, 0.601824),
 (2, 0.5070009),
 (2, 0.8031434),
 (1, 0.91742754),
 (3, 0.5880752),
 (0, 0.89962703),
 (2, 0.68360513),
 (1, 0.6153999),
 (0, 0.87016284),
 (3, 0.8383668),
 (2, 0.6453448),
 (2, 0.7180495),
 (0, 0.62336516),
 (3, 0.62851155),
 (0, 0.90503985),
 (0, 0.8877521),
 (0, 0.47058535),
 (1, 0.74759567),
 (2, 0.64655596),
 (2, 0.46668655),
 (1, 0.9221722),
 (0, 0.7730295),
 (3, 0.74396926),
 (2, 0.77019566),
 (2, 0.59166235),
 (3, 0.542991),
 (3, 0.9303014),
 (0, 0.60511136),
 (2, 0.5649855),
 (0, 0.5864449),
 (0, 0.59099877),
 (1, 0.6998634),
 (3, 0.83247685),
 (0, 0.9230459),
 (0, 0.889683),
 (0, 0.55777586),
 (0, 0.61749816),
 (1, 0.9098562),
 (0, 0.91190004),
 (3, 0.7351781),
 (1, 0.84505117),
 (1, 0.55320466),
 (2, 0.6911069),
 (1, 0.5996425),
 (0, 0.61892015),
 (2, 0.6208746),
 (0, 0.505323),
 (3, 0.6237743),
 (0, 0.4987931),
 ...]
In [160]:
# Collect each document's dominant topic and its contribution score into a frame
doc_topics = pd.DataFrame(test_max, columns=['Topic', 'Percentage_Contribution'])
doc_topics
Out[160]:
Topic Percentage_Contribution
0 1 0.551652
1 1 0.889989
2 1 0.914589
3 3 0.605850
4 2 0.718550
... ... ...
40260 1 0.725970
40261 3 0.731999
40262 3 0.403882
40263 3 0.527347
40264 3 0.295073

40265 rows × 2 columns

In [161]:
#Attach each document's dominant topic back onto newsarticlesdf4 via the shared row index

newsarticlesdf5 = pd.merge(newsarticlesdf4, doc_topics, left_index=True, right_index=True)
newsarticlesdf5.head()
Out[161]:
headline body charCount wordcount lastModified publication webPublicationDate PublicationDate PublishDate PublishTime ... body_clean_stop polarity_headline polarity_body subjectivity_headline subjectivity_body compound VaderSentiment headline_clean_stop_stem Topic Percentage_Contribution
0 Tamir Rice protesters picket house of Clevelan... Anger over the decision <a href="http://www.th... 3704 627 2017-07-14T20:17:15Z theguardian.com 2016-01-01T22:44:12Z 2016-01-01 22:44:12 2016-01-01 22:44:12 ... anger decision hrefhttpwwwtheguardiancomusnews... 0.000000 0.176356 0.0 0.598280 -0.2263 Negative tamir rice protest picket hous cleveland prose... 1 0.551652
1 Novel about Jewish-Palestinian love affair is ... A novel about a love affair between a Jewish w... 3797 614 2017-09-20T10:16:37Z theguardian.com 2016-01-01T01:04:07Z 2016-01-01 01:04:07 2016-01-01 01:04:07 ... novel love affair jewish woman palestinian man... 0.500000 0.115833 0.6 0.365417 0.7579 Positive novel jewishpalestinian love affair bar isra c... 1 0.889989
2 Former ABC managing director Brian Johns dies ... Former ABC managing director Brian Johns has d... 2475 412 2018-02-22T14:49:58Z theguardian.com 2016-01-01T01:22:12Z 2016-01-01 01:22:12 2016-01-01 01:22:12 ... former abc managing director brian johns died ... -0.050000 0.119396 0.2 0.368978 0.0000 Neutral former abc manag director brian john die age 79 1 0.914589
3 Hillary Clinton was told Angela Merkel is agai... Hillary Clinton was informed that German chanc... 6988 1191 2017-07-14T20:17:24Z theguardian.com 2016-01-01T04:36:07Z 2016-01-01 04:36:07 2016-01-01 04:36:07 ... hillary clinton informed german chancellor ang... 0.000000 0.029613 0.0 0.396207 0.0000 Neutral hillari clinton told angela merkel obama pheno... 3 0.605850
4 Munich police warned of attack by 'five to sev... <div id="block-568611b9e4b0073bf25b8dfe" class... 13445 2361 2016-12-30T11:58:39Z theguardian.com 2016-01-01T05:55:46Z 2016-01-01 05:55:46 2016-01-01 05:55:46 ... div idblock568611b9e4b0073bf25b8dfe classblock... 0.136364 0.148593 0.5 0.462881 -0.8658 Negative munich polic warn attack five seven suicid bom... 2 0.718550

5 rows × 28 columns

In [162]:
#Extract the top words for each LDA topic, keeping only alphabetic characters and spaces
topic_strings = lda_model.show_topics()

nwords = {topic: re.sub('[^A-Za-z ]+', '', word) for topic, word in topic_strings}

nwords
Out[162]:
{0: 'brexit  happen  johnson  say  trump  labour  brief  impeach  uk  bori',
 1: 'polic  trump  attack  fire  court  die  man  death  offic  murder',
 2: 'us  say  stori  australia  guardian  blaze  crisi  climat  trump  paper',
 3: 'trump  happen  elect  us  bushfir  year  record  mail  uk  climat'}
In [163]:
#Turn the topic->words mapping into a frame so it can be merged onto newsarticlesdf5
topic_words = pd.DataFrame(list(nwords.items()), columns=['Topic', 'Topic_Words'])
topic_words
Out[163]:
Topic Topic_Words
0 0 brexit happen johnson say trump labour b...
1 1 polic trump attack fire court die man d...
2 2 us say stori australia guardian blaze cr...
3 3 trump happen elect us bushfir year recor...
In [164]:
#Join the topic keyword strings onto newsarticlesdf5, then order rows chronologically

newsarticlesdf6 = pd.merge(newsarticlesdf5, topic_words, on='Topic')
newsarticlesdf6 = newsarticlesdf6.sort_values(by=['PublishDate']).reset_index(drop=True)
newsarticlesdf6.head()
Out[164]:
headline body charCount wordcount lastModified publication webPublicationDate PublicationDate PublishDate PublishTime ... polarity_headline polarity_body subjectivity_headline subjectivity_body compound VaderSentiment headline_clean_stop_stem Topic Percentage_Contribution Topic_Words
0 Tamir Rice protesters picket house of Clevelan... Anger over the decision <a href="http://www.th... 3704 627 2017-07-14T20:17:15Z theguardian.com 2016-01-01T22:44:12Z 2016-01-01 22:44:12 2016-01-01 22:44:12 ... 0.000000 0.176356 0.00 0.598280 -0.2263 Negative tamir rice protest picket hous cleveland prose... 1 0.551652 polic trump attack fire court die man d...
1 A year in Putin portraits, from mocking memes ... It’s been a bumper year for President Vladimir... 4428 755 2018-08-18T15:00:05Z theguardian.com 2016-01-01T09:30:28Z 2016-01-01 09:30:28 2016-01-01 09:30:28 ... 0.000000 0.064583 0.00 0.501334 -0.4019 Negative year putin portrait mock meme edibl effigi 3 0.499467 trump happen elect us bushfir year recor...
2 UK new year events go ahead amid heightened se... Millions of Britons have taken part in officia... 5423 902 2017-09-20T23:30:31Z theguardian.com 2016-01-01T00:11:22Z 2016-01-01 00:11:22 2016-01-01 00:11:22 ... 0.000000 -0.014431 0.00 0.314299 0.3400 Positive uk year event ahead amid heighten secur around... 3 0.915110 trump happen elect us bushfir year recor...
3 Carly Fiorina tweets support for alma mater's ... The Republican presidential candidate Carly Fi... 1980 314 2017-07-14T20:17:16Z theguardian.com 2016-01-01T21:40:36Z 2016-01-01 21:40:36 2016-01-01 21:40:36 ... 0.600000 0.012222 0.95 0.473333 0.4019 Positive carli fiorina tweet support alma mater rose bo... 3 0.509295 trump happen elect us bushfir year recor...
4 Munich police warned of attack by 'five to sev... <div id="block-568611b9e4b0073bf25b8dfe" class... 13445 2361 2016-12-30T11:58:39Z theguardian.com 2016-01-01T05:55:46Z 2016-01-01 05:55:46 2016-01-01 05:55:46 ... 0.136364 0.148593 0.50 0.462881 -0.8658 Negative munich polic warn attack five seven suicid bom... 2 0.718550 us say stori australia guardian blaze cr...

5 rows × 29 columns

In [165]:
# Row/column count after the topic-words join.
# NOTE(review): rows are 40210 here vs 40265 in doc_topics — presumably dropped
# by the earlier inner merge on the row index; confirm no articles were lost unintentionally.
newsarticlesdf6.shape
Out[165]:
(40210, 29)
In [166]:
#Run VADER on the topic keyword strings.
# NOTE(review): the imports cell creates the analyzer as `analyser` (British
# spelling); `analyzer` must be defined in a cell not shown here — confirm.

topic_sentiment = newsarticlesdf6['Topic_Words'].apply(lambda x: analyzer.polarity_scores(x))
# Expand each score dict into neg/neu/pos/compound columns. `axis` is passed by
# keyword: the positional form `pd.concat(..., 1)` was deprecated and removed in
# modern pandas.
# NOTE(review): this appends a second 'compound' column (headline VADER already
# produced one); later cells rely on that duplication, so it is left in place.
newsarticlesdf7 = pd.concat([newsarticlesdf6, topic_sentiment.apply(pd.Series)], axis=1)
newsarticlesdf7.head()
Out[166]:
headline body charCount wordcount lastModified publication webPublicationDate PublicationDate PublishDate PublishTime ... compound VaderSentiment headline_clean_stop_stem Topic Percentage_Contribution Topic_Words neg neu pos compound
0 Tamir Rice protesters picket house of Clevelan... Anger over the decision <a href="http://www.th... 3704 627 2017-07-14T20:17:15Z theguardian.com 2016-01-01T22:44:12Z 2016-01-01 22:44:12 2016-01-01 22:44:12 ... -0.2263 Negative tamir rice protest picket hous cleveland prose... 1 0.551652 polic trump attack fire court die man d... 0.783 0.217 0.0 -0.9584
1 A year in Putin portraits, from mocking memes ... It’s been a bumper year for President Vladimir... 4428 755 2018-08-18T15:00:05Z theguardian.com 2016-01-01T09:30:28Z 2016-01-01 09:30:28 2016-01-01 09:30:28 ... -0.4019 Negative year putin portrait mock meme edibl effigi 3 0.499467 trump happen elect us bushfir year recor... 0.000 1.000 0.0 0.0000
2 UK new year events go ahead amid heightened se... Millions of Britons have taken part in officia... 5423 902 2017-09-20T23:30:31Z theguardian.com 2016-01-01T00:11:22Z 2016-01-01 00:11:22 2016-01-01 00:11:22 ... 0.3400 Positive uk year event ahead amid heighten secur around... 3 0.915110 trump happen elect us bushfir year recor... 0.000 1.000 0.0 0.0000
3 Carly Fiorina tweets support for alma mater's ... The Republican presidential candidate Carly Fi... 1980 314 2017-07-14T20:17:16Z theguardian.com 2016-01-01T21:40:36Z 2016-01-01 21:40:36 2016-01-01 21:40:36 ... 0.4019 Positive carli fiorina tweet support alma mater rose bo... 3 0.509295 trump happen elect us bushfir year recor... 0.000 1.000 0.0 0.0000
4 Munich police warned of attack by 'five to sev... <div id="block-568611b9e4b0073bf25b8dfe" class... 13445 2361 2016-12-30T11:58:39Z theguardian.com 2016-01-01T05:55:46Z 2016-01-01 05:55:46 2016-01-01 05:55:46 ... -0.8658 Negative munich polic warn attack five seven suicid bom... 2 0.718550 us say stori australia guardian blaze cr... 0.000 1.000 0.0 0.0000

5 rows × 33 columns

In [167]:
# Column overview. Note 'compound' appears twice (index 23: headline VADER,
# index 32: topic VADER) — this duplicate name matters for later column selection.
newsarticlesdf7.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40210 entries, 0 to 40209
Data columns (total 33 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   headline                  40210 non-null  object        
 1   body                      40210 non-null  object        
 2   charCount                 40210 non-null  object        
 3   wordcount                 40210 non-null  object        
 4   lastModified              40210 non-null  object        
 5   publication               40210 non-null  object        
 6   webPublicationDate        40210 non-null  object        
 7   PublicationDate           40210 non-null  datetime64[ns]
 8   PublishDate               40210 non-null  datetime64[ns]
 9   PublishTime               40210 non-null  object        
 10  month                     40210 non-null  int64         
 11  Month full                40210 non-null  object        
 12  year                      40210 non-null  int64         
 13  headline_text_count       40210 non-null  int64         
 14  headline_char_count       40210 non-null  int64         
 15  headline_clean            40210 non-null  object        
 16  body_clean                40210 non-null  object        
 17  headline_clean_stop       40210 non-null  object        
 18  body_clean_stop           40210 non-null  object        
 19  polarity_headline         40210 non-null  float64       
 20  polarity_body             40210 non-null  float64       
 21  subjectivity_headline     40210 non-null  float64       
 22  subjectivity_body         40210 non-null  float64       
 23  compound                  40210 non-null  float64       
 24  VaderSentiment            40210 non-null  object        
 25  headline_clean_stop_stem  40210 non-null  object        
 26  Topic                     40210 non-null  int64         
 27  Percentage_Contribution   40210 non-null  float64       
 28  Topic_Words               40210 non-null  object        
 29  neg                       40210 non-null  float64       
 30  neu                       40210 non-null  float64       
 31  pos                       40210 non-null  float64       
 32  compound                  40210 non-null  float64       
dtypes: datetime64[ns](2), float64(10), int64(5), object(16)
memory usage: 10.1+ MB
In [168]:
# Because newsarticlesdf7 holds TWO columns named 'compound' (headline VADER and
# topic VADER), selecting ['PublishDate', 'compound'] yields THREE columns — which
# is why three new names are assigned below. `.copy()` detaches the slice so the
# rename cannot alias or warn against newsarticlesdf7.
newsarticlesdf8 = newsarticlesdf7[['PublishDate','compound']].copy()
newsarticlesdf8.columns = ['PublishDate','HeadlineCompound','TopicCompound']
# (Removed a stray `newsarticlesdf8.reset_index()` whose result was discarded — it had no effect.)
newsarticlesdf8.head(20)
Out[168]:
PublishDate HeadlineCompound TopicCompound
0 2016-01-01 -0.2263 -0.9584
1 2016-01-01 -0.4019 0.0000
2 2016-01-01 0.3400 0.0000
3 2016-01-01 0.4019 0.0000
4 2016-01-01 -0.8658 0.0000
5 2016-01-01 0.0000 0.0000
6 2016-01-01 -0.6249 0.0000
7 2016-01-01 0.0000 0.0000
8 2016-01-01 0.4767 0.0000
9 2016-01-01 0.7430 0.0000
10 2016-01-01 0.0000 0.0000
11 2016-01-01 0.0000 0.0000
12 2016-01-01 -0.4215 0.0000
13 2016-01-01 -0.2732 0.0000
14 2016-01-01 -0.7351 0.0000
15 2016-01-01 0.0000 0.0000
16 2016-01-01 0.7579 -0.9584
17 2016-01-01 0.0000 -0.9584
18 2016-01-01 -0.7184 -0.9584
19 2016-01-01 -0.3400 -0.9584
In [169]:
# Average the headline and topic compound scores per publication date
daily_sentiment = newsarticlesdf8.groupby('PublishDate')
newsarticlesdf9 = daily_sentiment.mean()
newsarticlesdf9.head()
Out[169]:
HeadlineCompound TopicCompound
PublishDate
2016-01-01 -0.117233 -0.359400
2016-01-02 -0.146345 -0.087127
2016-01-03 -0.153812 -0.294892
2016-01-04 -0.215289 -0.085191
2016-01-05 -0.054081 -0.233124

Stock market data

Features to be added to the stock market data

  • Change exchange rate of S&P 500 index from USD to GBP
  • EWMA - Exponential Weighted Moving Average
  • Trading Strategy
In [170]:
# Join the S&P 500 close series to the USD exchange-rate series on Date and
# derive a GBP close: SP500_Close_GBP = SP500_Close_USD * USD_Price.
# NOTE(review): the original comment described an ASX200/AUD->USD conversion,
# which does not match this code (it produces SP500_Close_GBP). Also,
# multiplying by a ~1.47 rate makes the "GBP" figure larger than the USD one;
# if USD_Price is USD-per-GBP, a USD->GBP conversion should DIVIDE — verify
# the quote direction of FXUSDTime.

SP500Conversion  = pd.merge(SP500Time,FXUSDTime, on = 'Date')
SP500Conversion.columns = ['SP500_Close_USD','USD_Price','Chg%','Change %']
SP500Conversion['SP500_Close_GBP'] = SP500Conversion['SP500_Close_USD'] * SP500Conversion['USD_Price']
SP500Conversion.drop(labels = ['Chg%'], axis = 'columns', inplace = True)
SP500Conversion.head()
Out[170]:
SP500_Close_USD USD_Price Change % SP500_Close_GBP
Date
2016-01-01 2043.939941 1.4748 0.06% 3014.402626
2016-01-02 2043.939941 1.4748 NaN 3014.402626
2016-01-03 2043.939941 1.4748 NaN 3014.402626
2016-01-04 2012.660034 1.4718 -0.20% 2962.233038
2016-01-05 2016.709961 1.4672 -0.31% 2958.916855
In [171]:
# Overlay the GBP-converted S&P 500 and the FTSE 100 close series on one axis
ax = SP500Conversion['SP500_Close_GBP'].plot(grid=True, label="S&P 500")
ftseTime['Close'].plot(ax=ax, grid=True, label="FTSE 100")

ax.set_ylabel('Close Price')
ax.set_title('S&P 500 & FTSE 100')
ax.legend()
plt.show()
In [173]:
# Exponentially weighted moving average with a span of one trading year.
# 253 is the average number of trading days per calendar year [33].
TRADING_DAYS_PER_YEAR = 253
ftseTime['EWMA'] = ftseTime['Close'].ewm(span=TRADING_DAYS_PER_YEAR).mean()
ftseTime.head()
Out[173]:
Close EWMA
Date
2016-01-04 6093.43 6093.430000
2016-01-05 6137.24 6115.421581
2016-01-06 6073.38 6101.296794
2016-01-07 5954.08 6064.055037
2016-01-08 5912.44 6033.250727
In [174]:
# Visual check of the close price against its one-year EWMA.
ftseTime[['Close','EWMA']].plot()
Out[174]:
<matplotlib.axes._subplots.AxesSubplot at 0x1b09edb8390>

EWMA Strategy

  • When the price crosses the EWMA curve from above - short (sell) the index - close price < EWMA
  • When the price crosses the EWMA curve from below - long (buy) the index - close price > EWMA
In [175]:
# Signed gap between close and trend: positive means the price sits above its EWMA
ftseTime['Difference'] = ftseTime['Close'].sub(ftseTime['EWMA'])
ftseTime.head()
Out[175]:
Close EWMA Difference
Date
2016-01-04 6093.43 6093.430000 0.000000
2016-01-05 6137.24 6115.421581 21.818419
2016-01-06 6073.38 6101.296794 -27.916794
2016-01-07 5954.08 6064.055037 -109.975037
2016-01-08 5912.44 6033.250727 -120.810727
In [176]:
def trading_strat(num):
    """Convert the Close-minus-EWMA gap into a trading signal.

    Returns "Long" for a positive gap (price above trend), "Short" for a
    negative gap, and "Hold" otherwise (a zero gap — or NaN, which fails
    both comparisons).
    """
    if num > 0:
        return "Long"
    if num < 0:
        return "Short"
    return "Hold"

# Map each daily gap onto its Long/Short/Hold signal
ftseTime['Trading_Strategy'] = ftseTime['Difference'].map(trading_strat)
ftseTime.head()
Out[176]:
Close EWMA Difference Trading_Strategy
Date
2016-01-04 6093.43 6093.430000 0.000000 Hold
2016-01-05 6137.24 6115.421581 21.818419 Long
2016-01-06 6073.38 6101.296794 -27.916794 Short
2016-01-07 5954.08 6064.055037 -109.975037 Short
2016-01-08 5912.44 6033.250727 -120.810727 Short
In [177]:
#Promote the Date index back to an ordinary column ahead of the sentiment merge
ftseTime1 = ftseTime.reset_index(drop=False)
ftseTime1.head()
Out[177]:
Date Close EWMA Difference Trading_Strategy
0 2016-01-04 6093.43 6093.430000 0.000000 Hold
1 2016-01-05 6137.24 6115.421581 21.818419 Long
2 2016-01-06 6073.38 6101.296794 -27.916794 Short
3 2016-01-07 5954.08 6064.055037 -109.975037 Short
4 2016-01-08 5912.44 6033.250727 -120.810727 Short
In [178]:
#Promote PublishDate from the index to a column so it can be merged on by name
newsarticlesdf10 = newsarticlesdf9.reset_index(drop=False)
newsarticlesdf10.head()
Out[178]:
PublishDate HeadlineCompound TopicCompound
0 2016-01-01 -0.117233 -0.359400
1 2016-01-02 -0.146345 -0.087127
2 2016-01-03 -0.153812 -0.294892
3 2016-01-04 -0.215289 -0.085191
4 2016-01-05 -0.054081 -0.233124
In [179]:
# Left-join daily sentiment onto trading days; dates with no articles get NaN sentiment
ftseSentiment = ftseTime1.merge(newsarticlesdf10, left_on="Date", right_on="PublishDate", how="left")
ftseSentiment.head()
Out[179]:
Date Close EWMA Difference Trading_Strategy PublishDate HeadlineCompound TopicCompound
0 2016-01-04 6093.43 6093.430000 0.000000 Hold 2016-01-04 -0.215289 -0.085191
1 2016-01-05 6137.24 6115.421581 21.818419 Long 2016-01-05 -0.054081 -0.233124
2 2016-01-06 6073.38 6101.296794 -27.916794 Short 2016-01-06 -0.181614 -0.156473
3 2016-01-07 5954.08 6064.055037 -109.975037 Short 2016-01-07 -0.146893 -0.287520
4 2016-01-08 5912.44 6033.250727 -120.810727 Short 2016-01-08 -0.184410 -0.239600

Comments
The initial strategy looks at the Exponential Weighted Moving Average over a calendar year to decide whether to buy, hold or sell the index. Next, the news headline sentiment is incorporated into the strategy.

The logic for the new trading strategy will be:

  • If the EWMA strategy states Long and both headline sentiment and topic sentiment are positive then buy the stock
  • If the EWMA strategy states Short and both headline sentiment and topic sentiment are negative then sell the stock
  • If the EWMA strategy states hold and both headline sentiment and topic sentiment are neutral then hold the stock
  • Otherwise hold the stock.
In [180]:
def strategy(s):
    """Combine the EWMA signal with news sentiment for one row.

    Implements the rules stated in the markdown above: Buy only when the
    EWMA strategy says Long AND both headline and topic sentiment are
    positive; Sell only when it says Short AND both sentiments are
    negative; Hold in every other case. (The original body checked the
    headline score alone, ignoring TopicCompound despite the documented
    strategy.) NaN sentiment — dates with no articles — fails every
    comparison and therefore falls through to "Hold".
    """
    if s['Trading_Strategy'] == "Long" and s['HeadlineCompound'] > 0 and s['TopicCompound'] > 0:
        return "Buy"
    if s['Trading_Strategy'] == "Short" and s['HeadlineCompound'] < 0 and s['TopicCompound'] < 0:
        return "Sell"
    return "Hold"

# Evaluate the combined sentiment strategy row by row
ftseSentiment['Sentiment_Strategy'] = ftseSentiment.apply(strategy, axis=1)
ftseSentiment.head()
Out[180]:
Date Close EWMA Difference Trading_Strategy PublishDate HeadlineCompound TopicCompound Sentiment_Strategy
0 2016-01-04 6093.43 6093.430000 0.000000 Hold 2016-01-04 -0.215289 -0.085191 Hold
1 2016-01-05 6137.24 6115.421581 21.818419 Long 2016-01-05 -0.054081 -0.233124 Hold
2 2016-01-06 6073.38 6101.296794 -27.916794 Short 2016-01-06 -0.181614 -0.156473 Sell
3 2016-01-07 5954.08 6064.055037 -109.975037 Short 2016-01-07 -0.146893 -0.287520 Sell
4 2016-01-08 5912.44 6033.250727 -120.810727 Short 2016-01-08 -0.184410 -0.239600 Sell
In [181]:
# Sanity check: the combined strategy produces all three signal labels.
ftseSentiment['Sentiment_Strategy'].unique()
Out[181]:
array(['Hold', 'Sell', 'Buy'], dtype=object)
In [182]:
#Make Date the index for the dataframe
# A new frame (ftseSentiment1) is created so the original ftseSentiment,
# with its integer index, remains available to later cells.
ftseSentiment1 = ftseSentiment.set_index('Date')
ftseSentiment1.head()
Out[182]:
Close EWMA Difference Trading_Strategy PublishDate HeadlineCompound TopicCompound Sentiment_Strategy
Date
2016-01-04 6093.43 6093.430000 0.000000 Hold 2016-01-04 -0.215289 -0.085191 Hold
2016-01-05 6137.24 6115.421581 21.818419 Long 2016-01-05 -0.054081 -0.233124 Hold
2016-01-06 6073.38 6101.296794 -27.916794 Short 2016-01-06 -0.181614 -0.156473 Sell
2016-01-07 5954.08 6064.055037 -109.975037 Short 2016-01-07 -0.146893 -0.287520 Sell
2016-01-08 5912.44 6033.250727 -120.810727 Short 2016-01-08 -0.184410 -0.239600 Sell
In [183]:
#Normalize the close price and EWMA to plot headline sentiment values on same graph using standardscaler.
# Fit and transform on ftseSentiment1 itself. The original fitted on the
# un-indexed ftseSentiment frame while assigning into ftseSentiment1 — the
# rows happen to be in the same order so values matched, but mixing the two
# frames is fragile and inconsistent with the S&P 500 cell below.
ftseSentiment1[['Close', 'EWMA', 'HeadlineCompound']] = StandardScaler().fit_transform(
    ftseSentiment1[['Close', 'EWMA', 'HeadlineCompound']]
)
ftseSentiment1.head()
Out[183]:
Close EWMA Difference Trading_Strategy PublishDate HeadlineCompound TopicCompound Sentiment_Strategy
Date
2016-01-04 -2.208036 -1.933211 0.000000 Hold 2016-01-04 -1.013582 -0.085191 Hold
2016-01-05 -2.114223 -1.887388 21.818419 Long 2016-01-05 0.765850 -0.233124 Hold
2016-01-06 -2.250970 -1.916819 -27.916794 Short 2016-01-06 -0.641877 -0.156473 Sell
2016-01-07 -2.506434 -1.994418 -109.975037 Short 2016-01-07 -0.258614 -0.287520 Sell
2016-01-08 -2.595600 -2.058604 -120.810727 Short 2016-01-08 -0.672733 -0.239600 Sell
In [189]:
#Plot headlinecompound first to overlay the index values on top
# All three series were standardised above, so they share a comparable scale.
ftseSentiment1[['HeadlineCompound','Close','EWMA']].plot(figsize = (21,15))
plt.title('FTSE 100 with 4 years EWMA and headline sentiment')
Out[189]:
Text(0.5,1,'FTSE 100 with 4 years EWMA and headline sentiment')

Comments
There is a lot of noise which can be seen above from the sentiment analysis on the news headlines. The lag of when news headlines are published and when this is incorporated in the price of the index can be seen. In mid 2018, a news headline had a high positive sentiment score and a few days later the stock price had increased. To improve on this, key words should be looked at to look at the impact on the stock market.

Q. The news articles have been published by The Guardian, but do these news articles have an effect on other global indices?

In [190]:
#Apply the EWMA with a period of 1 year similar to the FTSE 100 dataframe
# span=253 ~ one year of TRADING days. NOTE(review): this frame appears to
# contain calendar-daily rows (2016-01-02/03, a weekend, are present), so a
# one-year window here would be ~365 rows — confirm the intended span.
SP500Conversion['EWMA'] = SP500Conversion['SP500_Close_USD'].ewm(span=253).mean()
SP500Conversion.head()
Out[190]:
SP500_Close_USD USD_Price Change % SP500_Close_GBP EWMA
Date
2016-01-01 2043.939941 1.4748 0.06% 3014.402626 2043.939941
2016-01-02 2043.939941 1.4748 NaN 3014.402626 2043.939941
2016-01-03 2043.939941 1.4748 NaN 3014.402626 2043.939941
2016-01-04 2012.660034 1.4718 -0.20% 2962.233038 2036.026994
2016-01-05 2016.709961 1.4672 -0.31% 2958.916855 2032.102266
In [191]:
# Quick visual check: S&P 500 close vs its EWMA.
SP500Conversion[['SP500_Close_USD','EWMA']].plot()
Out[191]:
<matplotlib.axes._subplots.AxesSubplot at 0x1b0a193e048>
In [192]:
# Signed gap between the close and its EWMA; the sign of this gap drives the
# Long/Short/Hold signal below.
SP500Conversion['Difference'] = SP500Conversion['SP500_Close_USD'] - SP500Conversion['EWMA']
SP500Conversion.head()
Out[192]:
SP500_Close_USD USD_Price Change % SP500_Close_GBP EWMA Difference
Date
2016-01-01 2043.939941 1.4748 0.06% 3014.402626 2043.939941 0.000000
2016-01-02 2043.939941 1.4748 NaN 3014.402626 2043.939941 0.000000
2016-01-03 2043.939941 1.4748 NaN 3014.402626 2043.939941 0.000000
2016-01-04 2012.660034 1.4718 -0.20% 2962.233038 2036.026994 -23.366960
2016-01-05 2016.709961 1.4672 -0.31% 2958.916855 2032.102266 -15.392305
In [193]:
def trading_strat(num):
    """Translate a Close-minus-EWMA gap into a position signal.

    Returns "Long" for a positive gap, "Short" for a negative gap, and
    "Hold" otherwise (zero — and NaN, since NaN comparisons are False).
    """
    signal = "Hold"
    if num > 0:
        signal = "Long"
    elif num < 0:
        signal = "Short"
    return signal

# Map each day's Close-minus-EWMA gap to a Long/Short/Hold signal.
SP500Conversion['Trading_Strategy'] = SP500Conversion['Difference'].apply(trading_strat)
SP500Conversion.head()
Out[193]:
SP500_Close_USD USD_Price Change % SP500_Close_GBP EWMA Difference Trading_Strategy
Date
2016-01-01 2043.939941 1.4748 0.06% 3014.402626 2043.939941 0.000000 Hold
2016-01-02 2043.939941 1.4748 NaN 3014.402626 2043.939941 0.000000 Hold
2016-01-03 2043.939941 1.4748 NaN 3014.402626 2043.939941 0.000000 Hold
2016-01-04 2012.660034 1.4718 -0.20% 2962.233038 2036.026994 -23.366960 Short
2016-01-05 2016.709961 1.4672 -0.31% 2958.916855 2032.102266 -15.392305 Short
In [194]:
#Add sentiment scores to the stock data
# steps - reset index for both the S&P 500 and news articles dataframes
#Join on date
#plot sentiment against the time series plot

#Move Date out of the index so it can be used as a merge key
SP500Conversion1 = SP500Conversion.reset_index()
SP500Conversion1.head()
Out[194]:
Date SP500_Close_USD USD_Price Change % SP500_Close_GBP EWMA Difference Trading_Strategy
0 2016-01-01 2043.939941 1.4748 0.06% 3014.402626 2043.939941 0.000000 Hold
1 2016-01-02 2043.939941 1.4748 NaN 3014.402626 2043.939941 0.000000 Hold
2 2016-01-03 2043.939941 1.4748 NaN 3014.402626 2043.939941 0.000000 Hold
3 2016-01-04 2012.660034 1.4718 -0.20% 2962.233038 2036.026994 -23.366960 Short
4 2016-01-05 2016.709961 1.4672 -0.31% 2958.916855 2032.102266 -15.392305 Short
In [195]:
# Left-join so every trading day is kept even when no article was published
# on that date (the sentiment columns become NaN for such days).
SP500Sentiment = pd.merge(SP500Conversion1,newsarticlesdf10,left_on = "Date", right_on = "PublishDate", how = "left")
SP500Sentiment.head()
Out[195]:
Date SP500_Close_USD USD_Price Change % SP500_Close_GBP EWMA Difference Trading_Strategy PublishDate HeadlineCompound TopicCompound
0 2016-01-01 2043.939941 1.4748 0.06% 3014.402626 2043.939941 0.000000 Hold 2016-01-01 -0.117233 -0.359400
1 2016-01-02 2043.939941 1.4748 NaN 3014.402626 2043.939941 0.000000 Hold 2016-01-02 -0.146345 -0.087127
2 2016-01-03 2043.939941 1.4748 NaN 3014.402626 2043.939941 0.000000 Hold 2016-01-03 -0.153812 -0.294892
3 2016-01-04 2012.660034 1.4718 -0.20% 2962.233038 2036.026994 -23.366960 Short 2016-01-04 -0.215289 -0.085191
4 2016-01-05 2016.709961 1.4672 -0.31% 2958.916855 2032.102266 -15.392305 Short 2016-01-05 -0.054081 -0.233124
In [196]:
#Make Date the index for the dataframe
# A new frame (SP500Sentiment1) keeps the merged SP500Sentiment frame intact
# for the correlation analysis further down.
SP500Sentiment1 = SP500Sentiment.set_index('Date')
SP500Sentiment1.head()
Out[196]:
SP500_Close_USD USD_Price Change % SP500_Close_GBP EWMA Difference Trading_Strategy PublishDate HeadlineCompound TopicCompound
Date
2016-01-01 2043.939941 1.4748 0.06% 3014.402626 2043.939941 0.000000 Hold 2016-01-01 -0.117233 -0.359400
2016-01-02 2043.939941 1.4748 NaN 3014.402626 2043.939941 0.000000 Hold 2016-01-02 -0.146345 -0.087127
2016-01-03 2043.939941 1.4748 NaN 3014.402626 2043.939941 0.000000 Hold 2016-01-03 -0.153812 -0.294892
2016-01-04 2012.660034 1.4718 -0.20% 2962.233038 2036.026994 -23.366960 Short 2016-01-04 -0.215289 -0.085191
2016-01-05 2016.709961 1.4672 -0.31% 2958.916855 2032.102266 -15.392305 Short 2016-01-05 -0.054081 -0.233124
In [197]:
#Normalize the close price and EWMA to plot headline sentiment values on same graph using standardscaler.
# NOTE(review): StandardScaler is assumed to be imported in an earlier cell —
# it is not in the visible imports; confirm.
SP500Sentiment1[['SP500_Close_USD','EWMA','HeadlineCompound']] = StandardScaler().fit_transform(SP500Sentiment1[['SP500_Close_USD','EWMA','HeadlineCompound']])
SP500Sentiment1.head()
Out[197]:
SP500_Close_USD USD_Price Change % SP500_Close_GBP EWMA Difference Trading_Strategy PublishDate HeadlineCompound TopicCompound
Date
2016-01-01 -1.515489 1.4748 0.06% 3014.402626 -1.335382 0.000000 Hold 2016-01-01 0.069190 -0.359400
2016-01-02 -1.515489 1.4748 NaN 3014.402626 -1.335382 0.000000 Hold 2016-01-02 -0.252464 -0.087127
2016-01-03 -1.515489 1.4748 NaN 3014.402626 -1.335382 0.000000 Hold 2016-01-03 -0.334956 -0.294892
2016-01-04 -1.609226 1.4718 -0.20% 2962.233038 -1.360550 -23.366960 Short 2016-01-04 -1.014207 -0.085191
2016-01-05 -1.597090 1.4672 -0.31% 2958.916855 -1.373033 -15.392305 Short 2016-01-05 0.766947 -0.233124
In [199]:
#Plot headlinecompound first to overlay the index values on top
# The three standardised series share a comparable scale.
SP500Sentiment1[['HeadlineCompound','SP500_Close_USD','EWMA']].plot(figsize = (20,15))
plt.title('S&P 500 with 4 years EWMA and headline sentiment')
Out[199]:
Text(0.5,1,'S&P 500 with 4 years EWMA and headline sentiment')

Comments
There is a considerable amount of noise present in the news headline sentiment. However, there are some trends such as in 2018 the stock price hit a high point and there was a positive news sentiment.

Q. Is there any correlation between news headline sentiment and stock price?

In [200]:
# Spearman (rank) correlation between the numeric columns of the FTSE frame.
# NOTE(review): older pandas silently drops non-numeric columns in .corr();
# newer versions require numeric_only=True — confirm the pandas version used.
plt.subplots(figsize = (10,6))
graph1 = sns.heatmap(ftseSentiment.corr(method = 'spearman'),cmap = 'coolwarm',annot = True)
plt.title('Correlation Heatmap of FTSE 100 and Headline Sentiment')
Out[200]:
Text(0.5,1,'Correlation Heatmap of FTSE 100 and Headline Sentiment')
In [201]:
# Same Spearman correlation heatmap for the S&P 500 frame.
plt.subplots(figsize = (10,6))
graph1 = sns.heatmap(SP500Sentiment.corr(method = 'spearman'),cmap = 'coolwarm',annot = True)
plt.title('Correlation Heatmap of S&P 500 and Headline Sentiment')
Out[201]:
Text(0.5,1,'Correlation Heatmap of S&P 500 and Headline Sentiment')

Comments
Spearman correlation is used to measure the degree of association between two variables. There is a weak negative correlation between the headline sentiment and closing price of the stock index for both FTSE 100 and S&P500.

Q. Can news headline sentiment be used to predict the stock index values?

Multiple Linear Regression
In [202]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
In [203]:
ftseSentiment.head()
Out[203]:
Date Close EWMA Difference Trading_Strategy PublishDate HeadlineCompound TopicCompound Sentiment_Strategy
0 2016-01-04 6093.43 6093.430000 0.000000 Hold 2016-01-04 -0.215289 -0.085191 Hold
1 2016-01-05 6137.24 6115.421581 21.818419 Long 2016-01-05 -0.054081 -0.233124 Hold
2 2016-01-06 6073.38 6101.296794 -27.916794 Short 2016-01-06 -0.181614 -0.156473 Sell
3 2016-01-07 5954.08 6064.055037 -109.975037 Short 2016-01-07 -0.146893 -0.287520 Sell
4 2016-01-08 5912.44 6033.250727 -120.810727 Short 2016-01-08 -0.184410 -0.239600 Sell
In [204]:
#Format date to ordinal to run through linear regression model as linear regression model does not accept date as datetime format
# NOTE: `dt` becomes a module alias here — avoid reassigning the name `dt`
# in later cells, or the alias is lost.
import datetime as dt
ftseSentiment['DateOrdinal'] = ftseSentiment['Date'].map(dt.datetime.toordinal)
ftseSentiment['PublishDateOrdinal'] = ftseSentiment['PublishDate'].map(dt.datetime.toordinal)
ftseSentiment.head()
Out[204]:
Date Close EWMA Difference Trading_Strategy PublishDate HeadlineCompound TopicCompound Sentiment_Strategy DateOrdinal PublishDateOrdinal
0 2016-01-04 6093.43 6093.430000 0.000000 Hold 2016-01-04 -0.215289 -0.085191 Hold 735967 735967
1 2016-01-05 6137.24 6115.421581 21.818419 Long 2016-01-05 -0.054081 -0.233124 Hold 735968 735968
2 2016-01-06 6073.38 6101.296794 -27.916794 Short 2016-01-06 -0.181614 -0.156473 Sell 735969 735969
3 2016-01-07 5954.08 6064.055037 -109.975037 Short 2016-01-07 -0.146893 -0.287520 Sell 735970 735970
4 2016-01-08 5912.44 6033.250727 -120.810727 Short 2016-01-08 -0.184410 -0.239600 Sell 735971 735971
In [205]:
# Features: the two ordinal dates plus headline sentiment; target: close price.
# NOTE(review): DateOrdinal and PublishDateOrdinal are identical in every row
# shown above, so they are likely redundant as separate features — confirm.
X = ftseSentiment[['DateOrdinal','PublishDateOrdinal','HeadlineCompound']]
y = ftseSentiment['Close']
In [206]:
X.head()
Out[206]:
DateOrdinal PublishDateOrdinal HeadlineCompound
0 735967 735967 -0.215289
1 735968 735968 -0.054081
2 735969 735969 -0.181614
3 735970 735970 -0.146893
4 735971 735971 -0.184410
In [207]:
y.head()
Out[207]:
0    6093.43
1    6137.24
2    6073.38
3    5954.08
4    5912.44
Name: Close, dtype: float64
In [208]:
#Split the dataframe into training and test sets. Code for MLR adapted from StackAbuse.com[35]
#Enter a value for random state for reproducibility
# NOTE(review): a random split shuffles time-series rows, so training dates
# are interleaved with test dates — consider a chronological split for a
# realistic backtest.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.3,random_state = 42)
In [209]:
#Create MLR
# Ordinary-least-squares multiple linear regression with default settings.
from sklearn.linear_model import LinearRegression
mlr_mod = LinearRegression()
In [210]:
# Impute missing sentiment values with column means.
# Reassign instead of using inplace=True on the train_test_split slices,
# which raised SettingWithCopyWarning (chained assignment on a view/copy).
# Also impute the TEST set with the TRAINING means so that no test-set
# statistics leak into the model.
train_means = X_train.mean()
X_train = X_train.fillna(train_means)
X_test = X_test.fillna(train_means)
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\generic.py:6245: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)
In [211]:
#Fit the model
# Learns one coefficient per feature plus an intercept.
mlr_mod.fit(X_train,y_train)
Out[211]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)
In [212]:
#make predictions on the test set
# Returns a numpy array aligned with X_test's row order.
pred_price = mlr_mod.predict(X_test)
In [213]:
pred_price[1]
Out[213]:
7241.817136594036
In [214]:
#compute RMSE and R2 statistics
# RMSE is in the same units as the close price (index points).
test_set_rmse = (np.sqrt(mean_squared_error(y_test,pred_price)))
test_set_r2 = r2_score(y_test,pred_price)
In [215]:
print(test_set_rmse)
365.04806196143477
In [216]:
print(test_set_r2)
0.3047118989660077
In [217]:
# Side-by-side comparison of actual and predicted close prices (test set),
# indexed by the original row positions.
prediction_values = pd.DataFrame({'Actual':y_test,'Predicted':pred_price})
prediction_values.head()
Out[217]:
Actual Predicted
1320 7117.15 7595.329994
836 7328.92 7241.817137
413 7299.86 6891.650241
522 7527.33 6976.551665
1035 7094.12 7318.780345
In [218]:
X_test
Out[218]:
DateOrdinal PublishDateOrdinal HeadlineCompound
1320 737287 737287 -0.334366
836 736803 736803 -0.268987
413 736380 736380 -0.078209
522 736489 736489 -0.110082
1035 737002 737002 -0.074090
... ... ... ...
331 736298 736298 -0.068821
323 736290 736290 -0.135950
649 736616 736616 -0.364308
439 736406 736406 -0.157917
974 736941 736941 -0.232809

438 rows × 3 columns

In [219]:
y_test
Out[219]:
1320    7117.15
836     7328.92
413     7299.86
522     7527.33
1035    7094.12
         ...   
331     6783.79
323     6819.72
649     7535.44
439     7424.96
974     7457.86
Name: Close, Length: 438, dtype: float64
In [220]:
#Merge predicted values to the X_test by index
# Joining on the original row index keeps features and predictions aligned.
predicted_test_set = X_test.merge(prediction_values,left_index = True, right_index = True)
predicted_test_set.reset_index(drop = True, inplace = True)
predicted_test_set.head()
Out[220]:
DateOrdinal PublishDateOrdinal HeadlineCompound Actual Predicted
0 737287 737287 -0.334366 7117.15 7595.329994
1 736803 736803 -0.268987 7328.92 7241.817137
2 736380 736380 -0.078209 7299.86 6891.650241
3 736489 736489 -0.110082 7527.33 6976.551665
4 737002 737002 -0.074090 7094.12 7318.780345
In [221]:
# Sanity-check the ordinal -> datetime conversion for one sample value.
# Use a fresh name: the original assigned to `dt`, which shadowed the
# `import datetime as dt` module alias created in an earlier cell.
sample_date = datetime.fromordinal(736390)
sample_date
Out[221]:
datetime.datetime(2017, 3, 2, 0, 0)
In [222]:
#Change Date column from ordinal to datetime to plot
# NOTE(review): `date` (datetime.date) is assumed to be imported in an
# earlier cell — the visible imports bring in only datetime/timedelta; confirm.
predicted_test_set['NewDate'] = predicted_test_set['DateOrdinal'].map(date.fromordinal)
predicted_test_set.head()
Out[222]:
DateOrdinal PublishDateOrdinal HeadlineCompound Actual Predicted NewDate
0 737287 737287 -0.334366 7117.15 7595.329994 2019-08-16
1 736803 736803 -0.268987 7328.92 7241.817137 2018-04-19
2 736380 736380 -0.078209 7299.86 6891.650241 2017-02-20
3 736489 736489 -0.110082 7527.33 6976.551665 2017-06-09
4 737002 737002 -0.074090 7094.12 7318.780345 2018-11-04
In [223]:
#Sort the df by publish date and reset the index
# Restore chronological order (the random train/test split shuffled rows)
# so the line plot below reads left-to-right in time.
predicted_test_set.sort_values(by = 'NewDate',inplace = True)
predicted_test_set.reset_index(drop = True, inplace = True)
predicted_test_set.head()
Out[223]:
DateOrdinal PublishDateOrdinal HeadlineCompound Actual Predicted NewDate
0 735977 735977 -0.166687 5918.23 6641.367743 2016-01-14
1 735982 735982 -0.153994 5876.80 6640.897474 2016-01-19
2 735990 735990 -0.116440 5990.37 6634.827524 2016-01-27
3 735996 735996 0.051156 5922.01 6587.281438 2016-02-02
4 735997 735997 -0.173681 5837.14 6657.299255 2016-02-03
In [224]:
#Set NewDate as the index
# A date index gives the plot below a readable time axis.
predicted_test_set1 = predicted_test_set.set_index('NewDate')
predicted_test_set1.head()
Out[224]:
DateOrdinal PublishDateOrdinal HeadlineCompound Actual Predicted
NewDate
2016-01-14 735977 735977 -0.166687 5918.23 6641.367743
2016-01-19 735982 735982 -0.153994 5876.80 6640.897474
2016-01-27 735990 735990 -0.116440 5990.37 6634.827524
2016-02-02 735996 735996 0.051156 5922.01 6587.281438
2016-02-03 735997 735997 -0.173681 5837.14 6657.299255
In [225]:
# Compare actual vs predicted FTSE 100 close on the chronologically sorted test set.
predicted_test_set[['Actual','Predicted']].plot(figsize = (10,6))
plt.title('Actual vs Predicted Closing Price values of FTSE 100 using Multiple Linear Regression')
Out[225]:
Text(0.5,1,'Actual vs Predicted Closing Price values of FTSE 100 using Multiple Linear Regression')

Comments
Although the predicted values follow the same trend as the actual closing price values, the linear regression model does not capture sudden drops in the price, such as the sharp market falls within the test period. If the linear regression model were implemented to predict stock prices, heavy losses would have been suffered during such downturns.

Random Forest Regression

In [226]:
# Import from the public sklearn.ensemble namespace; sklearn.ensemble.forest
# is deprecated since scikit-learn 0.22 and removed in 0.24.
from sklearn.ensemble import RandomForestRegressor
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\utils\deprecation.py:144: FutureWarning: The sklearn.ensemble.forest module is  deprecated in version 0.22 and will be removed in version 0.24. The corresponding classes / functions should instead be imported from sklearn.ensemble. Anything that cannot be imported from sklearn.ensemble is now part of the private API.
  warnings.warn(message, FutureWarning)
In [227]:
#Code for Random Forest adapted from PythonData.com [36]
# 100 trees; oob_score=True reports an out-of-bag R^2 without needing a
# separate validation set.
# NOTE(review): no random_state is set, so results vary between runs.
RF_Model = RandomForestRegressor(n_estimators = 100, oob_score = True)
In [228]:
#Fit the model
# .fit() returns the estimator itself, so `randomforest` is the same object
# as RF_Model.
randomforest = RF_Model.fit(X_train,y_train)
In [229]:
# Predict closing prices for the held-out test rows.
rf_test_pred = RF_Model.predict(X_test)
In [230]:
rf_test_pred[1]
Out[230]:
7310.564900000005
In [231]:
# RMSE / R^2 for the random forest on the held-out test set, for comparison
# with the linear regression above.
rf_test_set_rmse = (np.sqrt(mean_squared_error(y_test,rf_test_pred)))
rf_test_set_r2 = r2_score(y_test,rf_test_pred)
In [232]:
print(rf_test_set_rmse)
42.32131936659726
In [233]:
print(rf_test_set_r2)
0.9906549183651873
In [234]:
# Actual vs random-forest-predicted closing prices (test set).
rf_prediction_values = pd.DataFrame({'Actual':y_test,'Predicted':rf_test_pred})
rf_prediction_values.head()
Out[234]:
Actual Predicted
1320 7117.15 7112.8711
836 7328.92 7310.5649
413 7299.86 7287.6011
522 7527.33 7488.0839
1035 7094.12 7087.4386
In [235]:
#Merge predicted values to the X_test by index
# Joining on the original row index keeps features and predictions aligned.
rf_predicted_test_set = X_test.merge(rf_prediction_values,left_index = True, right_index = True)
rf_predicted_test_set.reset_index(drop = True, inplace = True)
rf_predicted_test_set.head()
Out[235]:
DateOrdinal PublishDateOrdinal HeadlineCompound Actual Predicted
0 737287 737287 -0.334366 7117.15 7112.8711
1 736803 736803 -0.268987 7328.92 7310.5649
2 736380 736380 -0.078209 7299.86 7287.6011
3 736489 736489 -0.110082 7527.33 7488.0839
4 737002 737002 -0.074090 7094.12 7087.4386
In [236]:
#Change Date column from ordinal to datetime to plot
# NOTE(review): `date` (datetime.date) is assumed to be imported in an
# earlier cell; it is not in the visible imports — confirm.
rf_predicted_test_set['NewDate'] = rf_predicted_test_set['DateOrdinal'].map(date.fromordinal)
rf_predicted_test_set.head()
Out[236]:
DateOrdinal PublishDateOrdinal HeadlineCompound Actual Predicted NewDate
0 737287 737287 -0.334366 7117.15 7112.8711 2019-08-16
1 736803 736803 -0.268987 7328.92 7310.5649 2018-04-19
2 736380 736380 -0.078209 7299.86 7287.6011 2017-02-20
3 736489 736489 -0.110082 7527.33 7488.0839 2017-06-09
4 737002 737002 -0.074090 7094.12 7087.4386 2018-11-04
In [237]:
#Sort the df by publish date and reset the index
# Restore chronological order so the plot below reads left-to-right in time.
rf_predicted_test_set.sort_values(by = 'NewDate',inplace = True)
rf_predicted_test_set.reset_index(drop = True, inplace = True)
rf_predicted_test_set.head()
Out[237]:
DateOrdinal PublishDateOrdinal HeadlineCompound Actual Predicted NewDate
0 735977 735977 -0.166687 5918.23 5903.8235 2016-01-14
1 735982 735982 -0.153994 5876.80 5777.9653 2016-01-19
2 735990 735990 -0.116440 5990.37 5896.6196 2016-01-27
3 735996 735996 0.051156 5922.01 6028.2346 2016-02-02
4 735997 735997 -0.173681 5837.14 5952.3202 2016-02-03
In [ ]:
 
In [242]:
# Compare actual vs random-forest-predicted FTSE 100 close on the test set.
rf_predicted_test_set[['Actual','Predicted']].plot(figsize = (10,5))
plt.title('Actual vs Predicted Closing Price values of FTSE 100 using Random Forest Regression')
Out[242]:
Text(0.5,1,'Actual vs Predicted Closing Price values of FTSE 100 using Random Forest Regression')

Comments
The Random Forest Regression model follows the upward trend of the FTSE 100 close price, providing a better fit than the linear regression model. However, the RF model has an R² statistic of 0.9907, indicating that the model has likely overfitted.